langchain/cookbook/multi_modal_QA.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61ccf657-87fd-4541-bd06-b66288c150b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "! pip install \"openai>=1\" \"langchain>=0.0.331rc2\" matplotlib pillow"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aa5c8fc8-67c3-4fb7-aa37-e1a5d6682170",
   "metadata": {},
   "source": [
    "## Load Images\n",
    "\n",
    "We encode to base64, as noted in the [OpenAI GPT-4V doc](https://platform.openai.com/docs/guides/vision)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e67eb395-f960-4833-a0e0-1cc6a0131f55",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<img src=\"data:image/jpeg;base64,iVBORw0KGgoAAAANSUhEUgAAB9AAAAO1CAIAAAC5LOnUAAAACXBIWXMAAAsTAAALEwEAmpwYAAAgAElEQVR4nOzdB3gUZdfG8UkCAULoVUCKSq8iYgNBUToIhHQI0qQZFSkiIAiK+CqICvKBdFERQZCOiNJEaaE3pfeQtmmb7TvflZyXx3k3ITbULPx/V66wmZ15ppBdwj0n59F0AAAAAAAAAADwl2l/fQgAAAAAAAAAAEDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADALUDgDgAAAAAAAADAHRO4u7Pouu5yudxut8vlksfyIPtq6ktZX76U9Y0r5Lgjp9PpMaxwZrl15wQAAAAAAAAAuK14TeBuzLtV7P43Be5qR2pAt9vtyOJ0Om02m67rSUlJCQkJFy9eNJvNf8MZAwAAAAAAAAC8jBcE7ir1ttvt165dUzG6R7yey7a/Z03jJiqXd7lcDofDuMRut5vNZpvN9vHHH7/55psvvfTS/v37b8VZAgAAAAAAAAC8mxcE7rqu22w2k8l09OjRl19+2Ww2WywWp9Mp9eY320RScofDYc/y+7vBOBwOi8Vit9uljYwE7h7l8yaTqWbNmlqWBQsW3KKzBAAAAAAAAAB4Me8I3GNjY99+++2iRYsWKFDg+vXrUmku/WT+UPW6ZOg5tmg3yj6gKpOX4N5isdSvX79AgQKapi1evPhPnRMAAAAAAAAA4LbiHYH7smXLAgMDfXx8AgMDMzIyXC5X9jbrHqQ4XX35O0N5j7bvagSPmN5isTRs2NDX15cKdwAAAAAAAACANwXu69atq1q1qqZphQoVslqtuc99agzNjbOe/uaMqR5cLpfdbleby0KHw2G1WtPT0++//34J3L/44otbcYoAAAAAAAAAAO/mHYH72rVry5YtK4F7enq6rusZGRnGAnZFgnVjx3aVlbvdbmn7/ocmUBWuLPJA9t6kSRMJ3D/55JNbdJYAAAAAAAAAAC+WpwN3lYxv2bKlVKlSmqYVLVr0d27rcrmuX78eFxeXlpZmHPA3G7gnJiaaTCZjKK/SdnVIZrO5du3a/v7+mqZ9+umnf+rkAAAAAAAAAAC3lTwUuBvTcJV3f/fddx9++GG/fv0CAwM1TStYsODkyZPfe++9999//9SpUx4jpKenr1ixYsaMGZMmTXrhhRd69erVp0+fgQMHjho1avHixQkJCR67U4XwFotl3bp1U6ZMGTZsWP/+/QcMGDB06ND33ntv7ty5R44cMR6SfE5JSalbt27BggVvFrjv3r171qxZ77777uTJk1etWmW1WmWi19/fTR4AAAAAAAAA4F3yUOAurWAkB1epdL9+/bQsvll8fHzkS03TVqxYIVvJmrt3754wYULt2rXz58+vZVOpUqWhQ4fu3r07IyNDess4HA7Z9tKlS1OmTGnUqFH2rYoXLx4ZGbl69WqTySR7kQNLS0tr0qSJR4W7FMLrun7kyJHOnTvLHYL77rtv+vTpErhbLBZ1wGplAAAAAAAAAMDtIQ8F7qpVutVqVYH7mDFj7rvvvrvuuit//vz5stxzzz333Xdf/fr1v/vuO0muLRbL/v37n376aUnJy5QpU79+/aZNmzZp0qRhw4bVqlWTZuuaprVr1+7gwYO6rttsNunnnpKSMmzYMHm2dOnStWvXfvDBB5s2bVq9evWSJUvK8qpVq3755ZdSDi8pucViUZOmLlmyRM2warfbT58+3blzZ9nwnnvumTVrls1mk9lWZQSHw6HO9N++5AAAAAAAAACA27rCXbqvSDx94sSJn3766d133y1WrJimaQUKFNi4ceOOHTu+++67K1euyIanT59u3769VME3bNjwP//5T0xMzNUsJ0+e/Oqrr5o1a1agQIEiRYpomvbxxx+r3SUkJHz77bcSjt91113Dhw//6aef0tLSTCbTypUrx48fX69ePSlj79ixY0xMjJpG1WKxNGzY0DhpqtPptFqtZ86cCQsLkwErVKggWbyE7BkZGXJGdrudrjIAAAAAAAAAcPvJQ4G70+m0WCxSDy5F6KoGfO3atcWLF/fx8SlUqJDdbveoDd+4caOk7YGBgT/++GP2kVNTU1u1apUvXz5N04KCgiQ613XdZDINHjw4ICBA07Tp06enp6d7bDhv3jypcy9VqtT8+fPVcgncpXfNggULZOGJEyc6deqkaZqPj0+ZMmW++eYbdV7SvV0hcAcAAAAAAACA209eCdylclwauKu5TKXxiwTuEnwHBASkpKQYNzx37lx0dLQE7tHR0cnJyVJFLhvKgDabbdq0aSVKlJCuMj/88INsGxcX16ZNGylU37hxo/SEkQL2jIwMWScyMvLJJ58MDw//9ttvVVZuNptV4C5l7CdOnAgPD5fa9nvvvXffvn1yV8CYravzknMkdgcAAAAAAACA20leCdydTqeKyFVc7nK5JJVeu3ZtqVKlNE0rVqxYbGysrC+d0G02W1xc3M6dO5cvX37hwgXppS7RtjG737hxo4zQokWL77//XhYmJiYOGjTIz89P07RWrVodOnRIHY/NZpPxr127FhcXl5iYaLFYZLZVXdczMjJUD/cvv/zy559/Dg0N1TQtf/78tWrV+uGHH5KTk91ut81mU/3opeeMDK7mawUAAAAAAAAA3DbyXIW7hOyqk7tUmq9Zs0bq0wsXLpyWlqYqx9Wabrc7IyPDbDZnH/natWurVq3q16+fdIF/7LHHZLZVXdfT09OXLVsmZekFCxZs3Lhx//79Z82adfjw4Rz7y7tcLgncLRZLo0aNJHAfMWJE//79CxYsKPO1Ll++XML67DOjymOZPZXydgAAAAAAAAC4zeSVwF1ISC1l6RJMS8C9evVqqU8vVKiQCtzVysZQe+/evWvXrp05c+Zrr702ePDg0NDQtm3b1qtXr0SJElLJ3rx5861bt6puM5cvX46KipLoXMa/9957H3/88d69e0+YMGHlypXXrl2TkSUll6BcAncZsEqVKnIzQNO0wMDAUaNGGSd9lR05shibycjZ/UuXGQAAAAAAAABwuwfuQgJ0Yx69atWq4sWLSyCemprqEc3rup6cnLxp06axY8e2bdv2gQceqFSpkkyRqlSqVElS9WbNmm3evFmN7HA4Tp48OWjQoMqVKxvX1zStePHiTZo0iYiI+PjjjxMSEuTAVIV7vXr1fH19ZS8+Pj7FihXz9/eXHe3cudNisRjvCqj8XR57tKq/o6ibFupWhPrSZrOlpKTI7weoXwIwrg8AAAAAAAAAeVleDNyzW7NmjUyaqircJbyWaD4xMXH27NlNmzZVWXn+/PkrVqzYqFGjli1bBgcHDx06dPjw4UWLFpWWMjL9qZTGywhXr16dPXt27969W7ZsWbt2bVWxLsqWLTtmzJgzZ85I6br0cK9Tp44E7v7+/m3bth09enTr1q01TcuXL1/r1q0vX76sjlAyd3UjwWaz3ckN3FXXIOPtB/kyNjZ2+/btJ06ckOuset/fyZcLAAAAAAAAgBfxmsC9TJkymqYFBAQkJyer3Fb6sC9duvTuu++WsLtu3bpdu3YdMmTItGnT1q9ff+LECclt169fX65cOWNLGavVmj3JPXny5Oeffz5ixIjmzZvXqVOnbNmyBQoUkNh99OjRatZTs9lcv359KW9v1qzZli1bdF1funRp4cKFZeVp06bJcapduN1u2ZySbWO/frk48uDYsWOTJk1atGiR8W/Eow8+AAAAAAAAAORZ3hG4r1q1SrLsIkWKqB7ukl8fPHiwTZs2mqb5+vo+/PDDmzdvNnYskQc2m+2TTz6RpjSyjgp5rVZrSkpKampq9gYvBw4cmDx5cpUqVfLnz69pWuvWr
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import base64\n",
    "import io\n",
    "import os\n",
    "\n",
    "import numpy as np\n",
    "from IPython.display import HTML, display\n",
    "from PIL import Image\n",
    "\n",
    "\n",
    "def encode_image(image_path):\n",
    "    \"\"\"Getting the base64 string\"\"\"\n",
    "\n",
    "    with open(image_path, \"rb\") as image_file:\n",
    "        return base64.b64encode(image_file.read()).decode(\"utf-8\")\n",
    "\n",
    "\n",
    "def plt_img_base64(img_base64):\n",
    "    \"\"\"Display the base64 image\"\"\"\n",
    "\n",
    "    # Create an HTML img tag with the base64 string as the source\n",
    "    image_html = f'<img src=\"data:image/jpeg;base64,{img_base64}\" />'\n",
    "\n",
    "    # Display the image by rendering the HTML\n",
    "    display(HTML(image_html))\n",
    "\n",
    "\n",
    "# Image for QA\n",
    "path = \"/Users/rlm/Desktop/Multimodal_Eval/qa/llm_strategies.jpeg\"\n",
    "img_base64 = encode_image(path)\n",
    "plt_img_base64(img_base64)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "19bf59e1-ab31-4943-8f62-076d8de64b9d",
   "metadata": {},
   "source": [
    "## QA with GPT-4Vision\n",
    "\n",
    "We can use GPT-4V to perform QA on images. See here for more detail:\n",
    "* https://github.com/openai/openai-python/releases/tag/v1.0.0\n",
    "* https://platform.openai.com/docs/guides/vision"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "19b8f89b-cc1c-4fd1-80fe-08c17bc6a30f",
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_core.messages import HumanMessage, SystemMessage\n",
    "from langchain_openai import ChatOpenAI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "88033140-978c-4782-a721-703c3da634b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "chat = ChatOpenAI(model=\"gpt-4-vision-preview\", max_tokens=1024)\n",
    "\n",
    "msg = chat.invoke(\n",
    "    [\n",
    "        HumanMessage(\n",
    "            content=[\n",
    "                {\n",
    "                    \"type\": \"text\",\n",
    "                    \"text\": \"Based on the image, what is the difference in training strategy between a small and a large base model?\",\n",
    "                },\n",
    "                {\n",
    "                    \"type\": \"image_url\",\n",
    "                    \"image_url\": {\"url\": f\"data:image/jpeg;base64,{img_base64}\"},\n",
    "                },\n",
    "            ]\n",
    "        )\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c415ce7-4ac4-46fe-82a4-7bf9d677b97a",
   "metadata": {},
   "source": [
    "The results `msg.content` is shown below:"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8580c74f-0938-4986-80a9-8fc39e1913e3",
   "metadata": {},
   "source": [
    "The image appears to be a graph depicting the task accuracy of two different base model sizes (big and small) as a function of different training strategies and the effort/complexity associated with them. Here's a description of the differences in training strategy between a small and a large base model as suggested by the graph:\n",
    "\n",
    "1. **Zero-shot prompts**: Both models start with some baseline accuracy with no additional training, which is indicative of zero-shot learning capabilities. However, the big base model shows higher accuracy out of the box compared to the small base model.\n",
    "\n",
    "2. **Prompt engineering**: As the complexity increases with prompt engineering, the big base model shows a significant improvement in task accuracy, indicating that it can understand and leverage well-engineered prompts more effectively than the small base model.\n",
    "\n",
    "3. **Few-shot prompts**: With the introduction of few-shot prompts, where the model is given a few examples to learn from, the big base model continues to show higher task accuracy in comparison to the small base model, which also improves but not to the same extent.\n",
    "\n",
    "4. **Retrieval-augmented few-shot prompting**: At this stage, the models are enhanced with retrieval mechanisms to assist in the few-shot learning process. The big base model maintains a lead in task accuracy, demonstrating that it can better integrate retrieval-augmented strategies.\n",
    "\n",
    "5. **Finetuning**: As we move towards the right side of the graph, which represents finetuning, the small base model shows a more significant increase in accuracy compared to previous steps, suggesting that finetuning has a substantial impact on smaller models. The big base model, while also benefiting from finetuning, does not show as dramatic an increase, likely because it was already performing at a higher level due to its larger size and capacity.\n",
    "\n",
    "6. **Model training (finetuning, RLHF) & data engine**: The final section of the graph indicates that with extensive model training techniques like finetuning and Reinforcement Learning from Human Feedback (RLHF), combined with a robust data engine, the big base model can achieve near-perfect task accuracy. The small base model also improves but does not reach the same level, indicating that the larger model's capacity enables it to better utilize advanced training methods and data resources.\n",
    "\n",
    "In summary, the big base model benefits more from advanced training strategies and demonstrates higher task accuracy with increased effort and complexity, while the small base model requires more significant finetuning to achieve substantial improvements in performance.\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2552b0e6-9d07-40f1-8fbc-17567bd0fdd1",
   "metadata": {},
   "source": [
    "## QA with OSS Multi-modal LLMs\n",
    "\n",
    "We cam also test various open source multi-modal LLMs.\n",
    "\n",
    "See [here](https://github.com/langchain-ai/langchain/blob/master/cookbook/Semi_structured_and_multi_modal_RAG.ipynb) for instructions to build llama.cpp for multi-modal LLMs:\n",
    "\n",
    "Clone [llama.cpp](https://github.com/ggerganov/llama.cpp)\n",
    "\n",
    "Download the weights:\n",
    "* [LLaVA-7b](https://huggingface.co/mys/ggml_llava-v1.5-7b/tree/main)\n",
    "* [LLaVA-13b](https://huggingface.co/mys/ggml_llava-v1.5-13b)\n",
    "* [Bakllava](https://huggingface.co/mys/ggml_bakllava-1/tree/main)\n",
    "\n",
    "Build in your `llama.cpp` directory:\n",
    "```\n",
    "mkdir build && cd build && cmake ..\n",
    "cmake --build .\n",
    "```\n",
    "\n",
    "Support for multi-modal LLMs will [soon be added to llama.cpp](https://github.com/abetlen/llama-cpp-python/issues/813).\n",
    "\n",
    "In the meantime, you can test them with the CLI:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1293d0df-c979-4c53-9af5-c3bf918aad04",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%bash\n",
    "\n",
    "# Define the path to the image\n",
    "IMG_PATH=\"/Users/rlm/Desktop/Multimodal_Eval/qa/llm_strategies.jpeg\"\n",
    "\n",
    "# Define the model name\n",
    "#MODEL_NAME=\"llava-7b\"\n",
    "#MODEL_NAME=\"bakllava-1\"\n",
    "MODEL_NAME=\"llava-13b\"\n",
    "\n",
    "# Execute the command and save the output to the defined output file\n",
    "/Users/rlm/Desktop/Code/llama.cpp/build/bin/llava -m /Users/rlm/Desktop/Code/llama.cpp/models/${MODEL_NAME}/ggml-model-q5_k.gguf --mmproj /Users/rlm/Desktop/Code/llama.cpp/models/${MODEL_NAME}/mmproj-model-f16.gguf --temp 0.1 -p \"Based on the image, what is the difference in training strategy between a small and a large base model?\" --image \"$IMG_PATH\""
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}