From 82242dfbb1d302958527d851be65341b3d532839 Mon Sep 17 00:00:00 2001
From: Erick Friis
Date: Fri, 18 Oct 2024 10:06:55 -0700
Subject: [PATCH 1/8] docs: openai audio docs (#27459)

---
 docs/docs/integrations/chat/openai.ipynb | 156 ++++++++++++++++++++++-
 1 file changed, 155 insertions(+), 1 deletion(-)

diff --git a/docs/docs/integrations/chat/openai.ipynb b/docs/docs/integrations/chat/openai.ipynb
index 1aa43bcd08..5d7f5c9424 100644
--- a/docs/docs/integrations/chat/openai.ipynb
+++ b/docs/docs/integrations/chat/openai.ipynb
@@ -434,6 +434,160 @@
     "fine_tuned_model.invoke(messages)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "5d5d9793",
+   "metadata": {},
+   "source": [
+    "## Multimodal Inputs\n",
+    "\n",
+    "OpenAI has models that support multimodal inputs: you can pass images or audio to them in addition to text. For more information on how to do this in LangChain, head to the [multimodal inputs](/docs/how_to/multimodal_inputs) docs.\n",
+    "\n",
+    "You can see the list of models that support different modalities in [OpenAI's documentation](https://platform.openai.com/docs/models).\n",
+    "\n",
+    "As of this writing, the main OpenAI models to use are:\n",
+    "\n",
+    "- Image inputs: `gpt-4o`, `gpt-4o-mini`\n",
+    "- Audio inputs: `gpt-4o-audio-preview`\n",
+    "\n",
+    "For a full walkthrough of passing image inputs, see the [multimodal inputs how-to guide](/docs/how_to/multimodal_inputs); a minimal image-input sketch also appears after the audio example below.\n",
+    "\n",
+    "Below is an example of passing audio inputs to `gpt-4o-audio-preview`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "39d08780",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "\"I'm sorry, but I can't create audio content that involves yelling. Is there anything else I can help you with?\""
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import base64\n",
+    "\n",
+    "from langchain_openai import ChatOpenAI\n",
+    "\n",
+    "llm = ChatOpenAI(\n",
+    "    model=\"gpt-4o-audio-preview\",\n",
+    "    temperature=0,\n",
+    ")\n",
+    "\n",
+    "with open(\n",
+    "    \"../../../../libs/partners/openai/tests/integration_tests/chat_models/audio_input.wav\",\n",
+    "    \"rb\",\n",
+    ") as f:\n",
+    "    # read the raw audio bytes and base64-encode them\n",
+    "    audio = f.read()\n",
+    "    audio_b64 = base64.b64encode(audio).decode()\n",
+    "\n",
+    "output_message = llm.invoke(\n",
+    "    [\n",
+    "        (\n",
+    "            \"human\",\n",
+    "            [\n",
+    "                {\"type\": \"text\", \"text\": \"Transcribe the following:\"},\n",
+    "                # the audio clip says \"I'm sorry, but I can't create...\"\n",
+    "                {\n",
+    "                    \"type\": \"input_audio\",\n",
+    "                    \"input_audio\": {\"data\": audio_b64, \"format\": \"wav\"},\n",
+    "                },\n",
+    "            ],\n",
+    "        ),\n",
+    "    ]\n",
+    ")\n",
+    "output_message.content"
+   ]
+  },
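+  {
+   "cell_type": "markdown",
+   "id": "image-input-sketch",
+   "metadata": {},
+   "source": [
+    "As a minimal sketch of passing image inputs (the how-to guide linked above covers this in full), you can send a base64-encoded image as an `image_url` content block with a data URL. The image path below is a hypothetical placeholder:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "image-input-sketch-code",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import base64\n",
+    "\n",
+    "from langchain_openai import ChatOpenAI\n",
+    "\n",
+    "llm = ChatOpenAI(model=\"gpt-4o-mini\")\n",
+    "\n",
+    "# hypothetical local file; substitute any JPEG you have on disk\n",
+    "with open(\"example.jpg\", \"rb\") as f:\n",
+    "    image_b64 = base64.b64encode(f.read()).decode()\n",
+    "\n",
+    "output_message = llm.invoke(\n",
+    "    [\n",
+    "        (\n",
+    "            \"human\",\n",
+    "            [\n",
+    "                {\"type\": \"text\", \"text\": \"Describe this image:\"},\n",
+    "                {\n",
+    "                    \"type\": \"image_url\",\n",
+    "                    \"image_url\": {\"url\": f\"data:image/jpeg;base64,{image_b64}\"},\n",
+    "                },\n",
+    "            ],\n",
+    "        ),\n",
+    "    ]\n",
+    ")\n",
+    "output_message.content"
+   ]
+  },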
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f67a2cac", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "\n", + "llm = ChatOpenAI(\n", + " model=\"gpt-4o-audio-preview\",\n", + " temperature=0,\n", + " model_kwargs={\n", + " \"modalities\": [\"text\", \"audio\"],\n", + " \"audio\": {\"voice\": \"alloy\", \"format\": \"wav\"},\n", + " },\n", + ")\n", + "\n", + "output_message = llm.invoke(\n", + " [\n", + " (\"human\", \"Are you made by OpenAI? Just answer yes or no\"),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b7dd4e8b", + "metadata": {}, + "source": [ + "`output_message.additional_kwargs['audio']` will contain a dictionary like\n", + "```python\n", + "{\n", + " 'data': '