refactor: handle file

2024-10-30 09:20:44 +00:00 · 2023-03-18 06:05:02 +00:00 · 2023-03-18 06:05:02 +00:00 · 71e9489ff0
commit 71e9489ff0
parent 073b22927a
13 changed files with 306 additions and 194 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,4 +4,6 @@ __pycache__/
 .env
 image/
 audio/
 video/
 dataframe/
--- a/16
+++ b/16
@ -0,0 +1,16 @@
 FROM nvidia/cuda:11.7.0-runtime-ubuntu20.04
 WORKDIR /app/
 RUN \
  apt-get update && \
  apt-get install -y python3 python3-pip
 RUN apt-get install uvicorn -y
 RUN pip install --upgrade pip
 COPY requirements.txt .
 RUN pip install -r requirements.txt
 COPY . .
 ENTRYPOINT ["sleep", "infinity"]
 # ENTRYPOINT ["python3", "-m", "uvicorn", "main:app", "--reload", "--host=0.0.0.0", "--port=8000"]
--- a/README.md
+++ b/README.md
@ -0,0 +1,38 @@
 # Usage
 ### S3
 1. Create a bucket.
 2. Turn off the "Block all public access" setting for the bucket. ![image](assets/block_public_access.png)
 3. Add the following text to Bucket Policy.
   ```json
   {
     "Version": "2012-10-17",
     "Statement": [
       {
         "Sid": "AllowPublicRead",
         "Effect": "Allow",
         "Principal": {
           "AWS": "*"
         },
         "Action": "s3:GetObject",
         "Resource": "arn:aws:s3:::{your-bucket-name}/*"
       }
     ]
   }
   ```
 ## Environment
 The following environment variable is required.
 ```
 OPENAI_API_KEY
 ```
 Each of the following tools also needs its own environment variables.
 ```
 serpapi: SERPAPI_API_KEY
 bing-search: BING_SEARCH_URL, BING_SUBSCRIPTION_KEY
 ```
--- a/agent.py
+++ b/agent.py
@ -0,0 +1,79 @@
 from typing import Dict, List, Tuple
 from llm import ChatOpenAI
 from langchain.agents import load_tools
 from langchain.agents.agent import AgentExecutor
 from langchain.agents.tools import Tool
 from langchain.agents.initialize import initialize_agent
 from langchain.chains.conversation.memory import ConversationBufferMemory
 from utils import AWESOMEGPT_PREFIX, AWESOMEGPT_SUFFIX
 from tools.cpu import (
    RequestsGet,
    WineDB,
    ExitConversation,
 )
 from tools.gpu import (
    ImageEditing,
    InstructPix2Pix,
    Text2Image,
    ImageCaptioning,
    VisualQuestionAnswering,
 )
 from handler import Handler, FileType
 def get_agent() -> Tuple[AgentExecutor, Handler]:
    """Assemble the chat agent together with the handler for uploaded files.

    Returns:
        A pair ``(agent, handler)``: the executor returned by
        ``initialize_agent`` and a ``Handler`` whose image callback is the
        ``inference`` method of an ``ImageCaptioning`` instance.
    """
    print("Initializing AwesomeGPT")
    llm = ChatOpenAI(temperature=0)
    stock_tool_names = ["python_repl", "terminal", "serpapi", "wikipedia", "bing-search"]
    tools = list(load_tools(stock_tool_names, llm=llm))
    # One memory object is shared by ExitConversation and by the agent below.
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    models = {
        "RequestsGet": RequestsGet(),
        "WineDB": WineDB(),
        "ExitConversation": ExitConversation(memory),
        "Text2Image": Text2Image("cuda"),
        "ImageEditing": ImageEditing("cuda"),
        "InstructPix2Pix": InstructPix2Pix("cuda"),
        "VisualQuestionAnswering": VisualQuestionAnswering("cuda"),
    }
    # Every attribute whose name starts with "inference" becomes a Tool,
    # labelled with the name/description attributes found on that callable
    # (presumably attached by the `prompts` decorator; confirm in utils).
    for model in models.values():
        for attr_name in dir(model):
            if not attr_name.startswith("inference"):
                continue
            method = getattr(model, attr_name)
            tools.append(
                Tool(name=method.name, description=method.description, func=method)
            )
    # Only images have a description callback; the captioner supplies it.
    captioner = ImageCaptioning("cuda")
    handler = Handler(handle_func={FileType.IMAGE: captioner.inference})
    agent = initialize_agent(
        tools,
        llm,
        agent="chat-conversational-react-description",
        verbose=True,
        memory=memory,
        agent_kwargs={
            "system_message": AWESOMEGPT_PREFIX,
            "human_message": AWESOMEGPT_SUFFIX,
        },
    )
    return agent, handler
--- a/assets/block_public_access.png
+++ b/assets/block_public_access.png
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,16 @@
 version: "3"
 services:
  awesomegpt:
    build:
      dockerfile: Dockerfile
      context: .
    env_file:
      - .env
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              device_ids: ["3"] # You can choose which GPU to use
              capabilities: [gpu]
--- a/file.py
+++ b/file.py
@ -1,87 +0,0 @@
 import os
 import requests
 import uuid
 from typing import Callable
 from enum import Enum
 from PIL import Image
 import pandas as pd
 from utils import IMAGE_PROMPT, DATAFRAME_PROMPT
 from tools import IMAGE_MODEL
 class FileType(Enum):
    """Categories of file that this module distinguishes."""
    IMAGE = "image"
    AUDIO = "audio"
    VIDEO = "video"
    DATAFRAME = "dataframe"
    # Fallback category for anything that is none of the above.
    UNKNOWN = "unknown"
 def handle(file_name: str) -> Callable:
    """
    Choose the handler function for a file from its extension.

    Only the text before the first "?" is inspected, so a query string does
    not hide the extension. Unrecognised extensions map to handle_unknown.
    """
    path = file_name.partition("?")[0]
    if path.endswith((".png", ".jpg")):
        return handle_image
    if path.endswith((".mp3", ".wav")):
        return handle_audio
    if path.endswith((".mp4", ".avi")):
        return handle_video
    if path.endswith(".csv"):
        return handle_dataframe
    return handle_unknown
 def handle_image(i: int, file: str) -> str:
    """Download an image, rescale it, caption it, and return its prompt text.

    The image is saved under ``image/`` with a random 8-character name,
    rescaled so that its longer side is 512 pixels, converted to RGB and
    written back as PNG before ``IMAGE_MODEL.inference`` describes it.

    Args:
        i: Position of the file, inserted into the prompt.
        file: URL the image is downloaded from.

    Returns:
        ``IMAGE_PROMPT`` filled in with the local file name and description,
        or ``"Error: <message>"`` if the captioning model raises.
    """
    img_data = requests.get(file).content
    filename = os.path.join("image", str(uuid.uuid4())[0:8] + ".png")
    with open(filename, "wb") as f:
        size = f.write(img_data)
    # size is a byte count, so // 1000 gives kilobytes.
    print(f"Inputs: {file} ({size//1000}KB)  => {filename}")
    # Release the source file handle before the resized copy overwrites it.
    with Image.open(filename) as img:
        width, height = img.size
        ratio = min(512 / width, 512 / height)
        width_new, height_new = (round(width * ratio), round(height * ratio))
        resized = img.resize((width_new, height_new)).convert("RGB")
    resized.save(filename, "PNG")
    print(f"Resize image from {width}x{height} to {width_new}x{height_new}")
    try:
        description = IMAGE_MODEL.inference(filename)
    except Exception as e:
        # The caller concatenates the result into the prompt string, so a
        # failure has to be reported as text rather than as a dict.
        return "Error: " + str(e)
    return IMAGE_PROMPT.format(i=i, filename=filename, description=description)
 def handle_audio(i: int, file: str) -> str:
    """Return an empty string; no audio processing is implemented."""
    return ""
 def handle_video(i: int, file: str) -> str:
    """Return an empty string; no video processing is implemented."""
    return ""
 def handle_dataframe(i: int, file: str) -> str:
    """Download a CSV file, summarise it, and return its prompt text.

    The file is saved under ``dataframe/`` with a random 8-character name and
    summarised with ``DataFrame.describe()``.

    Args:
        i: Position of the file, inserted into the prompt.
        file: URL the CSV is downloaded from.

    Returns:
        ``DATAFRAME_PROMPT`` filled in with the local file name and summary,
        or ``"Error: <message>"`` if the CSV cannot be parsed or described.
    """
    content = requests.get(file).content
    filename = os.path.join("dataframe/", str(uuid.uuid4())[0:8] + ".csv")
    with open(filename, "wb") as f:
        size = f.write(content)
    # size is a byte count, so // 1000 gives kilobytes.
    print(f"Inputs: {file} ({size//1000}KB)  => {filename}")
    try:
        # Parsing is the step most likely to fail, so it is guarded together
        # with describe().
        description = str(pd.read_csv(filename).describe())
    except Exception as e:
        # The caller concatenates the result into the prompt string, so a
        # failure has to be reported as text rather than as a dict.
        return "Error: " + str(e)
    return DATAFRAME_PROMPT.format(i=i, filename=filename, description=description)
 def handle_unknown(i: int, file: str) -> str:
    """Return an empty string for a file whose type was not recognised."""
    return ""
--- a/handler.py
+++ b/handler.py
@ -0,0 +1,89 @@
 import os
 import requests
 import uuid
 from typing import Callable, Dict
 from enum import Enum
 from PIL import Image
 import pandas as pd
 from utils import IMAGE_PROMPT, DATAFRAME_PROMPT
 class FileType(Enum):
    """Categories of file; used as the keys of ``Handler.handle_func``."""
    IMAGE = "image"
    AUDIO = "audio"
    VIDEO = "video"
    DATAFRAME = "dataframe"
    # Fallback category for anything that is none of the above.
    UNKNOWN = "unknown"
 class Handler:
    """Download user-supplied files and turn each one into prompt text.

    ``handle_func`` maps a ``FileType`` to the callable that describes a local
    file of that type; only the ``FileType.IMAGE`` entry is looked up here.
    """

    def __init__(self, handle_func: Dict[FileType, Callable]):
        self.handle_func = handle_func

    def handle(self, i: int, file_name: str) -> str:
        """
        Dispatch a file to the matching handler based on its extension.

        Args:
            i: Position of the file, inserted into the prompt.
            file_name: URL of the file; the full URL (query string included)
                is what gets passed on to the chosen handler.

        Returns:
            The prompt text produced by the chosen handler ("" for audio,
            video and unrecognised files).
        """
        # Only the part before the query string carries the extension. It is
        # compared case-insensitively so "photo.PNG" is handled like
        # "photo.png".
        path = file_name.split("?")[0].lower()
        if path.endswith((".png", ".jpg")):
            return self.handle_image(i, file_name)
        if path.endswith((".mp3", ".wav")):
            return self.handle_audio(i, file_name)
        if path.endswith((".mp4", ".avi")):
            return self.handle_video(i, file_name)
        if path.endswith(".csv"):
            return self.handle_dataframe(i, file_name)
        return self.handle_unknown(i, file_name)

    def handle_image(self, i: int, remote_filename: str) -> str:
        """Download an image, rescale it, describe it, and build its prompt.

        The image is saved under ``image/`` with a random 8-character name,
        rescaled so that its longer side is 512 pixels, converted to RGB and
        written back as PNG before the ``FileType.IMAGE`` callable describes
        it.

        Returns:
            ``IMAGE_PROMPT`` filled in with the local file name and
            description, or ``"Error: <message>"`` if describing fails.
        """
        # NOTE(review): the download has no timeout or status check.
        img_data = requests.get(remote_filename).content
        local_filename = os.path.join("image", str(uuid.uuid4())[0:8] + ".png")
        with open(local_filename, "wb") as f:
            size = f.write(img_data)
        # size is a byte count, so // 1000 gives kilobytes.
        print(f"Inputs: {remote_filename} ({size//1000}KB)  => {local_filename}")
        # Release the source file handle before the resized copy overwrites it.
        with Image.open(local_filename) as img:
            width, height = img.size
            ratio = min(512 / width, 512 / height)
            width_new, height_new = (round(width * ratio), round(height * ratio))
            resized = img.resize((width_new, height_new)).convert("RGB")
        resized.save(local_filename, "PNG")
        print(f"Resize image from {width}x{height} to {width_new}x{height_new}")
        try:
            description = self.handle_func[FileType.IMAGE](local_filename)
        except Exception as e:
            return "Error: " + str(e)
        return IMAGE_PROMPT.format(
            i=i, filename=local_filename, description=description
        )

    def handle_audio(self, i: int, remote_filename: str) -> str:
        """Return an empty string; no audio processing is implemented."""
        return ""

    def handle_video(self, i: int, remote_filename: str) -> str:
        """Return an empty string; no video processing is implemented."""
        return ""

    def handle_dataframe(self, i: int, remote_filename: str) -> str:
        """Download a CSV file, summarise it, and build its prompt.

        The file is saved under ``dataframe/`` with a random 8-character name
        and summarised with ``DataFrame.describe()``.

        Returns:
            ``DATAFRAME_PROMPT`` filled in with the local file name and
            summary, or ``"Error: <message>"`` if the CSV cannot be parsed or
            described.
        """
        # NOTE(review): the download has no timeout or status check.
        content = requests.get(remote_filename).content
        local_filename = os.path.join("dataframe/", str(uuid.uuid4())[0:8] + ".csv")
        with open(local_filename, "wb") as f:
            size = f.write(content)
        # size is a byte count, so // 1000 gives kilobytes.
        print(f"Inputs: {remote_filename} ({size//1000}KB)  => {local_filename}")
        try:
            # Parsing is the step most likely to fail, so it is guarded
            # together with describe().
            description = str(pd.read_csv(local_filename).describe())
        except Exception as e:
            return "Error: " + str(e)
        return DATAFRAME_PROMPT.format(
            i=i, filename=local_filename, description=description
        )

    def handle_unknown(self, i: int, file: str) -> str:
        """Return an empty string for a file whose type was not recognised."""
        return ""
--- a/main.py
+++ b/main.py
@ -1,115 +1,73 @@
-from typing import List, TypedDict, Callable
+from typing import List, TypedDict
 import re
 from langchain.agents import load_tools
 from langchain.agents.initialize import initialize_agent
 from langchain.agents.tools import Tool
 from fastapi import FastAPI
 from pydantic import BaseModel
 from dotenv import load_dotenv
 from s3 import upload
-from llm import ChatOpenAI
+from utils import ERROR_PROMPT
-from file import handle
+from agent import get_agent
 from utils import (
    AWESOMEGPT_PREFIX,
    AWESOMEGPT_SUFFIX,
    ERROR_PROMPT,
 )
 from tools import AWESOME_MODEL, memory
 load_dotenv()
 app = FastAPI()
-
+agent, handler = get_agent()
 print("Initializing AwesomeGPT")
 llm = ChatOpenAI(temperature=0)
 tools = [
    *load_tools(
        ["python_repl", "serpapi", "wikipedia", "bing-search"],
        llm=llm,
    ),
 ]
 for class_name, instance in AWESOME_MODEL.items():
    for e in dir(instance):
        if e.startswith("inference"):
            func = getattr(instance, e)
            tools.append(Tool(name=func.name, description=func.description, func=func))
 agent = initialize_agent(
    tools,
    llm,
    agent="chat-conversational-react-description",
    verbose=True,
    memory=memory,
    agent_kwargs={
        "system_message": AWESOMEGPT_PREFIX,
        "human_message": AWESOMEGPT_SUFFIX,
    },
 )
 class Request(BaseModel):
    text: str
    state: List[str]
    files: List[str]
    key: str
    query: str
    files: List[str]
 class Response(TypedDict):
    text: str
    response: str
-    additional: List[str]
+    files: List[str]
@app.get("/")
 async def index():
-    return {"message": "Hello World"}
+    return {"message": "Hello World. I'm AwesomeGPT."}
@app.post("/command")
 async def command(request: Request) -> Response:
-    text = request.text
+    query = request.query
    state = request.state
    files = request.files
    key = request.key
    print("=============== Running =============")
-    print("Inputs:", text, state, files)
+    print("Inputs:", query, files)
    # TODO - add state to memory (use key)
    print("======>Previous memory:\n %s" % agent.memory)
-    promptedText = ""
+    promptedQuery = ""
    import time
    for i, file in enumerate(files):
-        promptedText += handle(file)(i + 1, file)
+        promptedQuery += handler.handle(i + 1, file)
-    promptedText += text
+    promptedQuery += query
-    print("======>Prompted Text:\n %s" % promptedText)
+    print("======>Prompted Text:\n %s" % promptedQuery)
    try:
-        res = agent({"input": promptedText})
+        res = agent({"input": promptedQuery})
    except Exception as e:
        try:
            res = agent(
                {
-                    "input": ERROR_PROMPT.format(promptedText=promptedText, e=str(e)),
+                    "input": ERROR_PROMPT.format(promptedQuery=promptedQuery, e=str(e)),
                }
            )
        except Exception as e:
-            return {"text": promptedText, "response": str(e), "additional": []}
+            return {"response": str(e), "files": []}
    images = re.findall("(image/\S*png)", res["output"])
    dataframes = re.findall("(dataframe/\S*csv)", res["output"])
    return {
        "text": promptedText,
        "response": res["output"],
-        "additional": [upload(image) for image in images],
+        "files": [upload(image) for image in images]
        + [upload(dataframe) for dataframe in dataframes],
    }
--- a/requirements.txt
+++ b/requirements.txt
@ -6,6 +6,10 @@ langchain
 fastapi
 boto3
 llama_index
-torch==1.13.1+cu117
+torch
 transformers
-diffusers
+diffusers
 python_dotenv
 google-search-results
 psycopg2-binary
 wikipedia
--- a/tools/cpu.py
+++ b/tools/cpu.py
@ -1,22 +1,12 @@
 from langchain.chains.conversation.memory import ConversationBufferMemory
 from utils import prompts
 from env import settings
 from vfm import (
    ImageEditing,
    InstructPix2Pix,
    Text2Image,
    ImageCaptioning,
    VisualQuestionAnswering,
 )
 import requests
 from llama_index.readers.database import DatabaseReader
 from llama_index import GPTSimpleVectorIndex
-
+from langchain.memory.chat_memory import BaseChatMemory
 memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
 class RequestsGet:
@ -79,6 +69,9 @@ class WineDB:
 class ExitConversation:
    def __init__(self, memory: BaseChatMemory):
        self.memory = memory
    @prompts(
        name="exit_conversation",
        description="A tool to exit the conversation. "
@ -87,19 +80,5 @@ class ExitConversation:
    )
    def inference(self, query: str) -> str:
        """Run the tool."""
-        memory.chat_memory.messages = []
+        self.memory.chat_memory.messages = []
        return ""
 IMAGE_MODEL = ImageCaptioning("cuda:3")
 AWESOME_MODEL = {
    "RequestsGet": RequestsGet(),
    "WineDB": WineDB(),
    "ExitConversation": ExitConversation(),
    "Text2Image": Text2Image("cuda:3"),
    "ImageEditing": ImageEditing("cuda:3"),
    "InstructPix2Pix": InstructPix2Pix("cuda:3"),
    "VisualQuestionAnswering": VisualQuestionAnswering("cuda:3"),
 }
--- a/tools/gpu.py
+++ b/tools/gpu.py
--- a/utils.py
+++ b/utils.py
@ -8,29 +8,44 @@ from langchain.output_parsers.base import BaseOutputParser
 IMAGE_PROMPT = """
-{i}th image: provide a figure named {filename}. The description is: {description}.
+{i}th file: provide a figure named {filename}. The description is: {description}.
 """
 DATAFRAME_PROMPT = """
 {i}th dataframe: provide a dataframe named {filename}. The description is: {description}.
 """
 IMAGE_SUFFIX = """
 Please understand and answer the image based on this information. The image understanding is complete, so don't try to understand the image again.
 """
 AWESOMEGPT_PREFIX = """Awesome GPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Awesome GPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
-Awesome GPT is able to process and understand large amounts of text and images. As a language model, Awesome GPT can not directly read images, but it has a list of tools to finish different visual tasks. 
+AUDIO_PROMPT = """
 {i}th file: provide a audio named {filename}. The description is: {description}.
 Please understand and answer the audio based on this information. The audio understanding is complete, so don't try to understand the audio again.
 """
 VIDEO_PROMPT = """
 {i}th file: provide a video named {filename}. The description is: {description}.
 Please understand and answer the video based on this information. The video understanding is complete, so don't try to understand the video again.
 """
 DATAFRAME_PROMPT = """
 {i}th file: provide a dataframe named {filename}. The description is: {description}.
 You are able to use the dataframe to answer the question.
 You have to act like an data analyst who can do an effective analysis through dataframe.
 """
 AWESOMEGPT_PREFIX = """Awesome GPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. 
 Awesome GPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
 Awesome GPT is able to process and understand large amounts of various types of files. As a language model, Awesome GPT can not directly read various types of files, but it has a list of tools to finish different visual tasks. 
 Each image will have a file name formed as "image/xxx.png"
 Each audio will have a file name formed as "audio/xxx.mp3"
 Each video will have a file name formed as "video/xxx.mp4"
 Each dataframe will have a file name formed as "dataframe/xxx.csv"
-Awesome GPT can invoke different tools to indirectly understand pictures. When talking about images, Awesome GPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, Awesome GPT is also known that the image may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real image. Awesome GPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated.
+Awesome GPT can invoke different tools to indirectly understand files. When talking about files, Awesome GPT is very strict to the file name and will never fabricate nonexistent files. 
-
+When using tools to generate new files, Awesome GPT is also known that the file may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real file. 
-Human may provide new figures to Awesome GPT with a description. The description helps Awesome GPT to understand this image, but Awesome GPT should use tools to finish following tasks, rather than directly imagine from the description.
+Awesome GPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the file content and file name. It will remember to provide the file name from the last tool observation, if a new file is generated.
 Human may provide new figures to Awesome GPT with a description. The description helps Awesome GPT to understand this file, but Awesome GPT should use tools to finish following tasks, rather than directly imagine from the description.
 Overall, Awesome GPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics."""
@ -38,7 +53,8 @@ AWESOMEGPT_SUFFIX = """TOOLS
 ------
 Awesome GPT can ask the user to use tools to look up information that may be helpful in answering the users original question. 
 You are very strict to the filename correctness and will never fake a file name if it does not exist.
-You will remember to provide the image file name loyally if it's provided in the last tool observation.
+You will remember to provide the file name loyally if it's provided in the last tool observation.
 The tools the human can use are:
 {{tools}}
@ -51,10 +67,12 @@ Here is the user's input (remember to respond with a markdown code snippet of a
 {{{{input}}}}"""
-ERROR_PROMPT = "An error has occurred for the following text: \n{promptedText} Please explain this error.\n {e}"
+ERROR_PROMPT = "An error has occurred for the following text: \n{promptedQuery} Please explain this error.\n {e}"
 os.makedirs("image", exist_ok=True)
 os.makedirs("audio", exist_ok=True)
 os.makedirs("video", exist_ok=True)
 os.makedirs("dataframe", exist_ok=True)