diff --git a/.gitignore b/.gitignore
index 95cfa6a..d4d6a87 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,4 +4,6 @@ __pycache__/
 .env
 image/
+audio/
+video/
 dataframe/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..6042a46
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,16 @@
+FROM nvidia/cuda:11.7.0-runtime-ubuntu20.04
+WORKDIR /app/
+
+RUN \
+    apt-get update && \
+    apt-get install -y python3 python3-pip
+RUN apt-get install uvicorn -y
+
+RUN pip install --upgrade pip
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+COPY . .
+
+ENTRYPOINT ["sleep", "infinity"]
+# ENTRYPOINT ["python3", "-m", "uvicorn", "main:app", "--reload", "--host=0.0.0.0", "--port=8000"]
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..a1766d5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,38 @@
+# Usage
+
+## S3
+
+1. Create a bucket.
+2. Turn off the "Block all public access" setting for the bucket. ![image](assets/block_public_access.png)
+3. Add the following statement to the bucket policy:
+   ```json
+   {
+     "Version": "2012-10-17",
+     "Statement": [
+       {
+         "Sid": "AllowPublicRead",
+         "Effect": "Allow",
+         "Principal": {
+           "AWS": "*"
+         },
+         "Action": "s3:GetObject",
+         "Resource": "arn:aws:s3:::{your-bucket-name}/*"
+       }
+     ]
+   }
+   ```
+
+## Environment
+
+The following environment variable is required:
+
+```
+OPENAI_API_KEY
+```
+
+The following environment variables are optional; set them to enable the corresponding search tools:
+
+```
+serpapi: SERPAPI_API_KEY
+bing-search: BING_SEARCH_URL, BING_SUBSCRIPTION_KEY
+```
diff --git a/agent.py b/agent.py
new file mode 100644
index 0000000..ee4f406
--- /dev/null
+++ b/agent.py
@@ -0,0 +1,79 @@
+from typing import Dict, Tuple
+
+from llm import ChatOpenAI
+from langchain.agents import load_tools
+from langchain.agents.agent import AgentExecutor
+from langchain.agents.tools import Tool
+from langchain.agents.initialize import initialize_agent
+from langchain.chains.conversation.memory import ConversationBufferMemory
+
+from utils import AWESOMEGPT_PREFIX, AWESOMEGPT_SUFFIX
+
+from tools.cpu import (
+    RequestsGet,
+    WineDB,
+    ExitConversation,
+)
+from tools.gpu import (
+    ImageEditing,
+    InstructPix2Pix,
+    Text2Image,
+    ImageCaptioning,
+    VisualQuestionAnswering,
+)
+from handler import Handler, FileType
+
+
+def get_agent() -> Tuple[AgentExecutor, Handler]:
+    print("Initializing AwesomeGPT")
+    llm = ChatOpenAI(temperature=0)
+    tools = [
+        *load_tools(
+            ["python_repl", "terminal", "serpapi", "wikipedia", "bing-search"],
+            llm=llm,
+        ),
+    ]
+    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
+
+    models = {
+        "RequestsGet": RequestsGet(),
+        "WineDB": WineDB(),
+        "ExitConversation": ExitConversation(memory),
+        "Text2Image": Text2Image("cuda"),
+        "ImageEditing": ImageEditing("cuda"),
+        "InstructPix2Pix": InstructPix2Pix("cuda"),
+        "VisualQuestionAnswering": VisualQuestionAnswering("cuda"),
+    }
+
+    for _, instance in models.items():
+        for e in dir(instance):
+            if e.startswith("inference"):
+                func = getattr(instance, e)
+                tools.append(
+                    Tool(name=func.name, description=func.description, func=func)
+                )
+
+    handle_models: Dict[FileType, ImageCaptioning] = {
+        FileType.IMAGE: ImageCaptioning("cuda"),
+    }
+
+    handler = Handler(
+        handle_func={
+            file_type: model.inference for file_type, model in handle_models.items()
+        }
+    )
+
+    return (
+        initialize_agent(
+            tools,
+            llm,
+            agent="chat-conversational-react-description",
+            verbose=True,
+            memory=memory,
+            agent_kwargs={
+                "system_message": AWESOMEGPT_PREFIX,
+                "human_message": 
AWESOMEGPT_SUFFIX,
+            },
+        ),
+        handler,
+    )
diff --git a/assets/block_public_access.png b/assets/block_public_access.png
new file mode 100644
index 0000000..ca744b4
Binary files /dev/null and b/assets/block_public_access.png differ
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..fc2a547
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,16 @@
+version: "3"
+
+services:
+  awesomegpt:
+    build:
+      dockerfile: Dockerfile
+      context: .
+    env_file:
+      - .env
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              device_ids: ["3"] # You can choose which GPU to use
+              capabilities: [gpu]
diff --git a/file.py b/file.py
deleted file mode 100644
index 4544294..0000000
--- a/file.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import os
-import requests
-import uuid
-from typing import Callable
-from enum import Enum
-
-from PIL import Image
-
-import pandas as pd
-
-from utils import IMAGE_PROMPT, DATAFRAME_PROMPT
-from tools import IMAGE_MODEL
-
-
-class FileType(Enum):
-    IMAGE = "image"
-    AUDIO = "audio"
-    VIDEO = "video"
-    DATAFRAME = "dataframe"
-    UNKNOWN = "unknown"
-
-
-def handle(file_name: str) -> Callable:
-    """
-    Parse file type from file name (ex. image, audio, video, dataframe, etc.)
-    """
-    file_name = file_name.split("?")[0]
-
-    if file_name.endswith(".png") or file_name.endswith(".jpg"):
-        return handle_image
-    elif file_name.endswith(".mp3") or file_name.endswith(".wav"):
-        return handle_audio
-    elif file_name.endswith(".mp4") or file_name.endswith(".avi"):
-        return handle_video
-    elif file_name.endswith(".csv"):
-        return handle_dataframe
-    else:
-        return handle_unknown
-
-
-def handle_image(i: int, file: str) -> str:
-    img_data = requests.get(file).content
-    filename = os.path.join("image", str(uuid.uuid4())[0:8] + ".png")
-    with open(filename, "wb") as f:
-        size = f.write(img_data)
-    print(f"Inputs: {file} ({size//1000}MB) => {filename}")
-    img = Image.open(filename)
-    width, height = img.size
-    ratio = min(512 / width, 512 / height)
-    width_new, height_new = (round(width * ratio), round(height * ratio))
-    img = img.resize((width_new, height_new))
-    img = img.convert("RGB")
-    img.save(filename, "PNG")
-    print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
-    try:
-        description = IMAGE_MODEL.inference(filename)
-    except Exception as e:
-        return {"text": "image upload", "response": str(e), "additional": []}
-
-    return IMAGE_PROMPT.format(i=i, filename=filename, description=description)
-
-
-def handle_audio(i: int, file: str) -> str:
-    return ""
-
-
-def handle_video(i: int, file: str) -> str:
-    return ""
-
-
-def handle_dataframe(i: int, file: str) -> str:
-    content = requests.get(file).content
-    filename = os.path.join("dataframe/", str(uuid.uuid4())[0:8] + ".csv")
-    with open(filename, "wb") as f:
-        size = f.write(content)
-    print(f"Inputs: {file} ({size//1000}MB) => {filename}")
-    df = pd.read_csv(filename)
-    try:
-        description = str(df.describe())
-    except Exception as e:
-        return {"text": "image upload", "response": str(e), "additional": []}
-
-    return DATAFRAME_PROMPT.format(i=i, filename=filename, description=description)
-
-
-def handle_unknown(i: int, file: str) -> str:
-    return ""
diff --git a/handler.py b/handler.py
new file mode 100644
index 0000000..69352f9
--- /dev/null
+++ b/handler.py
@@ -0,0 +1,89 @@
+import os
+import requests
+import uuid
+from typing import Callable, Dict
+from enum import Enum
+
+from PIL import Image
+
+import pandas as pd
+
+from utils import IMAGE_PROMPT, DATAFRAME_PROMPT
+
+
+class 
FileType(Enum):
+    IMAGE = "image"
+    AUDIO = "audio"
+    VIDEO = "video"
+    DATAFRAME = "dataframe"
+    UNKNOWN = "unknown"
+
+
+class Handler:
+    def __init__(self, handle_func: Dict[FileType, Callable]):
+        self.handle_func = handle_func
+
+    def handle(self, i: int, file_name: str) -> str:
+        """
+        Parse the file type from the file name (e.g. image, audio, video, dataframe)
+        """
+        base_name = file_name.split("?")[0]
+
+        if base_name.endswith(".png") or base_name.endswith(".jpg"):
+            return self.handle_image(i, file_name)
+        elif base_name.endswith(".mp3") or base_name.endswith(".wav"):
+            return self.handle_audio(i, file_name)
+        elif base_name.endswith(".mp4") or base_name.endswith(".avi"):
+            return self.handle_video(i, file_name)
+        elif base_name.endswith(".csv"):
+            return self.handle_dataframe(i, file_name)
+        else:
+            return self.handle_unknown(i, file_name)
+
+    def handle_image(self, i: int, remote_filename: str) -> str:
+        img_data = requests.get(remote_filename).content
+        local_filename = os.path.join("image", str(uuid.uuid4())[0:8] + ".png")
+        with open(local_filename, "wb") as f:
+            size = f.write(img_data)
+        print(f"Inputs: {remote_filename} ({size // 1000}KB) => {local_filename}")
+        img = Image.open(local_filename)
+        width, height = img.size
+        ratio = min(512 / width, 512 / height)
+        width_new, height_new = (round(width * ratio), round(height * ratio))
+        img = img.resize((width_new, height_new))
+        img = img.convert("RGB")
+        img.save(local_filename, "PNG")
+        print(f"Resize image from {width}x{height} to {width_new}x{height_new}")
+        try:
+            description = self.handle_func[FileType.IMAGE](local_filename)
+        except Exception as e:
+            return "Error: " + str(e)
+
+        return IMAGE_PROMPT.format(
+            i=i, filename=local_filename, description=description
+        )
+
+    def handle_audio(self, i: int, remote_filename: str) -> str:
+        return ""
+
+    def handle_video(self, i: int, remote_filename: str) -> str:
+        return ""
+
+    def handle_dataframe(self, i: int, remote_filename: str) -> str:
+        content = requests.get(remote_filename).content
+        local_filename = os.path.join("dataframe/", str(uuid.uuid4())[0:8] + ".csv")
+        with open(local_filename, "wb") as f:
+            size = f.write(content)
+        print(f"Inputs: {remote_filename} ({size // 1000}KB) => {local_filename}")
+        df = pd.read_csv(local_filename)
+        try:
+            description = str(df.describe())
+        except Exception as e:
+            return "Error: " + str(e)
+
+        return DATAFRAME_PROMPT.format(
+            i=i, filename=local_filename, description=description
+        )
+
+    def handle_unknown(self, i: int, file: str) -> str:
+        return ""
diff --git a/main.py b/main.py
index ecbff76..c794bc5 100644
--- a/main.py
+++ b/main.py
@@ -1,115 +1,72 @@
-from typing import List, TypedDict, Callable
+from typing import List, TypedDict
 import re
 
-from langchain.agents import load_tools
-from langchain.agents.initialize import initialize_agent
-from langchain.agents.tools import Tool
-
-
 from fastapi import FastAPI
 from pydantic import BaseModel
-from dotenv import load_dotenv
 
 from s3 import upload
-from llm import ChatOpenAI
-from file import handle
-from utils import (
-    AWESOMEGPT_PREFIX,
-    AWESOMEGPT_SUFFIX,
-    ERROR_PROMPT,
-)
-from tools import AWESOME_MODEL, memory
-
-load_dotenv()
+from utils import ERROR_PROMPT
+from agent import get_agent
 
 app = FastAPI()
-
-
-print("Initializing AwesomeGPT")
-llm = ChatOpenAI(temperature=0)
-tools = [
-    *load_tools(
-        ["python_repl", "serpapi", "wikipedia", "bing-search"],
-        llm=llm,
-    ),
-]
-
-for class_name, instance in AWESOME_MODEL.items():
-    for e in dir(instance):
-        if 
e.startswith("inference"): - func = getattr(instance, e) - tools.append(Tool(name=func.name, description=func.description, func=func)) - -agent = initialize_agent( - tools, - llm, - agent="chat-conversational-react-description", - verbose=True, - memory=memory, - agent_kwargs={ - "system_message": AWESOMEGPT_PREFIX, - "human_message": AWESOMEGPT_SUFFIX, - }, -) +agent, handler = get_agent() class Request(BaseModel): - text: str - state: List[str] - files: List[str] key: str + query: str + files: List[str] class Response(TypedDict): - text: str response: str - additional: List[str] + files: List[str] @app.get("/") async def index(): - return {"message": "Hello World"} + return {"message": "Hello World. I'm AwesomeGPT."} @app.post("/command") async def command(request: Request) -> Response: - text = request.text - state = request.state + query = request.query files = request.files key = request.key print("=============== Running =============") - print("Inputs:", text, state, files) + print("Inputs:", query, files) # TODO - add state to memory (use key) print("======>Previous memory:\n %s" % agent.memory) - promptedText = "" + promptedQuery = "" + import time for i, file in enumerate(files): - promptedText += handle(file)(i + 1, file) + promptedQuery += handler.handle(i + 1, file) - promptedText += text + promptedQuery += query - print("======>Prompted Text:\n %s" % promptedText) + print("======>Prompted Text:\n %s" % promptedQuery) try: - res = agent({"input": promptedText}) + res = agent({"input": promptedQuery}) except Exception as e: try: res = agent( { - "input": ERROR_PROMPT.format(promptedText=promptedText, e=str(e)), + "input": ERROR_PROMPT.format(promptedQuery=promptedQuery, e=str(e)), } ) except Exception as e: - return {"text": promptedText, "response": str(e), "additional": []} + return {"response": str(e), "files": []} images = re.findall("(image/\S*png)", res["output"]) + dataframes = re.findall("(dataframe/\S*csv)", res["output"]) return { - "text": promptedText, "response": res["output"], - "additional": [upload(image) for image in images], + "files": [upload(image) for image in images] + + [upload(dataframe) for dataframe in dataframes], } diff --git a/requirements.txt b/requirements.txt index b8827be..bf1cf60 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,6 +6,10 @@ langchain fastapi boto3 llama_index -torch==1.13.1+cu117 +torch transformers -diffusers \ No newline at end of file +diffusers +python_dotenv +google-search-results +psycopg2-binary +wikipedia \ No newline at end of file diff --git a/tools.py b/tools/cpu.py similarity index 80% rename from tools.py rename to tools/cpu.py index da81aa8..8528300 100644 --- a/tools.py +++ b/tools/cpu.py @@ -1,22 +1,12 @@ -from langchain.chains.conversation.memory import ConversationBufferMemory - from utils import prompts from env import settings -from vfm import ( - ImageEditing, - InstructPix2Pix, - Text2Image, - ImageCaptioning, - VisualQuestionAnswering, -) import requests from llama_index.readers.database import DatabaseReader from llama_index import GPTSimpleVectorIndex - -memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True) +from langchain.memory.chat_memory import BaseChatMemory class RequestsGet: @@ -79,6 +69,9 @@ class WineDB: class ExitConversation: + def __init__(self, memory: BaseChatMemory): + self.memory = memory + @prompts( name="exit_conversation", description="A tool to exit the conversation. 
" @@ -87,19 +80,5 @@ class ExitConversation: ) def inference(self, query: str) -> str: """Run the tool.""" - memory.chat_memory.messages = [] + self.memory.chat_memory.messages = [] return "" - - -IMAGE_MODEL = ImageCaptioning("cuda:3") - - -AWESOME_MODEL = { - "RequestsGet": RequestsGet(), - "WineDB": WineDB(), - "ExitConversation": ExitConversation(), - "Text2Image": Text2Image("cuda:3"), - "ImageEditing": ImageEditing("cuda:3"), - "InstructPix2Pix": InstructPix2Pix("cuda:3"), - "VisualQuestionAnswering": VisualQuestionAnswering("cuda:3"), -} diff --git a/vfm.py b/tools/gpu.py similarity index 100% rename from vfm.py rename to tools/gpu.py diff --git a/utils.py b/utils.py index bf24e67..92e12e1 100644 --- a/utils.py +++ b/utils.py @@ -8,29 +8,44 @@ from langchain.output_parsers.base import BaseOutputParser IMAGE_PROMPT = """ -{i}th image: provide a figure named {filename}. The description is: {description}. +{i}th file: provide a figure named {filename}. The description is: {description}. + +Please understand and answer the image based on this information. The image understanding is complete, so don't try to understand the image again. """ -DATAFRAME_PROMPT = """ -{i}th dataframe: provide a dataframe named {filename}. The description is: {description}. +AUDIO_PROMPT = """ +{i}th file: provide a audio named {filename}. The description is: {description}. + +Please understand and answer the audio based on this information. The audio understanding is complete, so don't try to understand the audio again. """ +VIDEO_PROMPT = """ +{i}th file: provide a video named {filename}. The description is: {description}. -IMAGE_SUFFIX = """ -Please understand and answer the image based on this information. The image understanding is complete, so don't try to understand the image again. +Please understand and answer the video based on this information. The video understanding is complete, so don't try to understand the video again. """ -AWESOMEGPT_PREFIX = """Awesome GPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. Awesome GPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. +DATAFRAME_PROMPT = """ +{i}th file: provide a dataframe named {filename}. The description is: {description}. + +You are able to use the dataframe to answer the question. +You have to act like an data analyst who can do an effective analysis through dataframe. +""" -Awesome GPT is able to process and understand large amounts of text and images. As a language model, Awesome GPT can not directly read images, but it has a list of tools to finish different visual tasks. +AWESOMEGPT_PREFIX = """Awesome GPT is designed to be able to assist with a wide range of text and visual related tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. +Awesome GPT is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. +Awesome GPT is able to process and understand large amounts of various types of files. As a language model, Awesome GPT can not directly read various types of files, but it has a list of tools to finish different visual tasks. 
 Each image will have a file name formed as "image/xxx.png"
+Each audio will have a file name formed as "audio/xxx.mp3"
+Each video will have a file name formed as "video/xxx.mp4"
 Each dataframe will have a file name formed as "dataframe/xxx.csv"
-Awesome GPT can invoke different tools to indirectly understand pictures. When talking about images, Awesome GPT is very strict to the file name and will never fabricate nonexistent files. When using tools to generate new image files, Awesome GPT is also known that the image may not be the same as the user's demand, and will use other visual question answering tools or description tools to observe the real image. Awesome GPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the image content and image file name. It will remember to provide the file name from the last tool observation, if a new image is generated.
-
-Human may provide new figures to Awesome GPT with a description. The description helps Awesome GPT to understand this image, but Awesome GPT should use tools to finish following tasks, rather than directly imagine from the description.
+Awesome GPT can invoke different tools to indirectly understand files. When talking about files, Awesome GPT is very strict about file names and will never fabricate nonexistent files.
+When using tools to generate new files, Awesome GPT also knows that the generated file may not match the user's demand, and will use other visual question answering tools or description tools to observe the real file.
+Awesome GPT is able to use tools in a sequence, and is loyal to the tool observation outputs rather than faking the file content or file name. It will remember to provide the file name from the last tool observation if a new file is generated.
+The human may provide new files to Awesome GPT with a description. The description helps Awesome GPT to understand the file, but Awesome GPT should use tools to finish the following tasks, rather than directly imagining from the description.
 
 Overall, Awesome GPT is a powerful visual dialogue assistant tool that can help with a wide range of tasks and provide valuable insights and information on a wide range of topics."""
 
@@ -38,7 +53,8 @@ AWESOMEGPT_SUFFIX = """TOOLS
 ------
 Awesome GPT can ask the user to use tools to look up information that may be helpful in answering the users original question. You are very strict to the filename correctness and will never fake a file name if it does not exist.
-You will remember to provide the image file name loyally if it's provided in the last tool observation.
+You will remember to provide the file name loyally if it's provided in the last tool observation.
+
 The tools the human can use are:
 
 {{tools}}
@@ -51,10 +67,12 @@ Here is the user's input (remember to respond with a markdown code snippet of a
 
 {{{{input}}}}"""
 
-ERROR_PROMPT = "An error has occurred for the following text: \n{promptedText} Please explain this error.\n {e}"
+ERROR_PROMPT = "An error has occurred for the following text: \n{promptedQuery} Please explain this error.\n {e}"
 
 os.makedirs("image", exist_ok=True)
+os.makedirs("audio", exist_ok=True)
+os.makedirs("video", exist_ok=True)
 os.makedirs("dataframe", exist_ok=True)
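
A note for reviewers: the tool-registration loop in `agent.py` (`Tool(name=func.name, description=func.description, func=func)`) depends on the `@prompts` decorator from `utils.py` attaching `name` and `description` attributes to each `inference` method. That decorator is not part of this diff, so the following is only a sketch of the contract the loop assumes, not the actual implementation:

```python
from typing import Callable


def prompts(name: str, description: str) -> Callable:
    """Hypothetical reconstruction: tag a tool method with the metadata
    that agent.py later reads back via func.name and func.description."""

    def decorator(func: Callable) -> Callable:
        func.name = name
        func.description = description
        return func

    return decorator
```

Under this contract, a model class exposes a new tool simply by adding a method whose name starts with `inference` and decorating it with `@prompts`.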
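
The HTTP contract of `/command` also changed: the request is now `key`/`query`/`files` and the response is `response`/`files` (S3 URLs), replacing the old `text`/`state`/`additional` shape. A minimal client sketch against the new schema — this assumes the commented-out uvicorn entrypoint in the Dockerfile is enabled on port 8000, and the session key and file URLs below are hypothetical:

```python
import requests

payload = {
    "key": "session-123",  # hypothetical session key; main.py only logs it for now (see TODO)
    "query": "Describe the image and summarize the CSV.",
    "files": [
        # hypothetical public URLs; handler.py routes each one by extension
        "https://example.com/photo.png",
        "https://example.com/data.csv",
    ],
}

res = requests.post("http://localhost:8000/command", json=payload).json()
print(res["response"])  # the agent's final answer
print(res["files"])     # S3 URLs for any generated image/*.png or dataframe/*.csv files
```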