initial commit for Azure RAG cookbook (#1272)

Co-authored-by: juston <96567547+justonf@users.noreply.github.com>
Max Reid 2 months ago committed by GitHub
parent f6ea13ebed
commit 5f552669f7

@ -0,0 +1,50 @@
bin
obj
csx
.vs
edge
Publish
*.user
*.suo
*.cscfg
*.Cache
project.lock.json
/packages
/TestResults
/tools/NuGet.exe
/App_Data
/secrets
/data
.secrets
appsettings.json
local.settings.json
node_modules
dist
vector_database_wikipedia_articles_embedded
# Local python packages
.python_packages/
# Python Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# Azurite artifacts
__blobstorage__
__queuestorage__
__azurite_db*__.json

@ -0,0 +1,5 @@
{
"recommendations": [
"ms-azuretools.vscode-azurefunctions"
]
}

@ -0,0 +1,153 @@
import azure.functions as func
import json
import logging
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.core.credentials import AzureKeyCredential
from openai import OpenAI
import os
from azure.search.documents.models import (
VectorizedQuery
)
# Initialize the Azure Function App
app = func.FunctionApp()
def generate_embeddings(text):
    # Check if text is provided
    if not text:
        logging.error("No text provided for embedding generation.")
        return None
    try:
        # Initialize OpenAI client
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        logging.info("OpenAI client initialized successfully.")
        # Generate embeddings using OpenAI API
        response = client.embeddings.create(
            input=text,
            model=os.getenv("EMBEDDINGS_MODEL")
        )
        logging.info("Embeddings created successfully.")
        # Extract the embedding from the response
        embedding = response.data[0].embedding
        logging.debug(f"Generated embedding: {embedding}")
        return embedding
    except Exception as e:
        logging.error(f"Error generating embeddings: {str(e)}")
        return None
@app.route(route="vector_similarity_search", auth_level=func.AuthLevel.ANONYMOUS)
def vector_similarity_search(req: func.HttpRequest) -> func.HttpResponse:
logging.info("Received request for vector similarity search.")
try:
# Parse the request body as JSON
req_body = req.get_json()
logging.info("Request body parsed successfully.")
except ValueError:
logging.error("Invalid JSON in request body.")
return func.HttpResponse(
"Invalid JSON in request body.",
status_code=400
)
# Extract parameters from the request body
search_service_endpoint = req_body.get('search_service_endpoint')
index_name = req_body.get('index_name')
query = req_body.get('query')
k_nearest_neighbors = req_body.get('k_nearest_neighbors')
search_column = req_body.get('search_column')
use_hybrid_query = req_body.get('use_hybrid_query')
logging.info(f"Parsed request parameters: search_service_endpoint={search_service_endpoint}, index_name={index_name}, query={query}, k_nearest_neighbors={k_nearest_neighbors}, search_column={search_column}, use_hybrid_query={use_hybrid_query}")
    # Check for required parameters
    if not (search_service_endpoint and index_name and query):
        logging.error("Missing required parameters in request body.")
        return func.HttpResponse(
            "Please provide search_service_endpoint, index_name, and query in the request body.",
            status_code=400
        )
    # Generate embeddings for the query
    embeddings = generate_embeddings(query)
    if embeddings is None:
        return func.HttpResponse(
            "Error generating embeddings for the query.",
            status_code=500
        )
    logging.info("Generated embeddings for the query.")
try:
# Create a vectorized query
        vector_query = VectorizedQuery(vector=embeddings, k_nearest_neighbors=int(k_nearest_neighbors), fields=search_column)
logging.info("Vector query generated successfully.")
except Exception as e:
logging.error(f"Error generating vector query: {str(e)}")
return func.HttpResponse(
f"Error generating vector query: {str(e)}",
status_code=500
)
try:
# Initialize the search client
search_client = SearchClient(
endpoint=search_service_endpoint,
index_name=index_name,
credential=AzureKeyCredential(os.getenv("SEARCH_SERVICE_API_KEY"))
)
logging.info("Search client created successfully.")
# Initialize the index client and get the index schema
index_client = SearchIndexClient(endpoint=search_service_endpoint, credential=AzureKeyCredential(os.getenv("SEARCH_SERVICE_API_KEY")))
index_schema = index_client.get_index(index_name)
for field in index_schema.fields:
logging.info(f"Field: {field.name}, Type: {field.type}")
# Filter out non-vector fields
non_vector_fields = [field.name for field in index_schema.fields if field.type not in ["Edm.ComplexType", "Collection(Edm.ComplexType)","Edm.Vector","Collection(Edm.Single)"]]
logging.info(f"Non-vector fields in the index: {non_vector_fields}")
except Exception as e:
logging.error(f"Error creating search client: {str(e)}")
return func.HttpResponse(
f"Error creating search client: {str(e)}",
status_code=500
)
# Determine if hybrid query should be used
search_text = query if use_hybrid_query else None
try:
# Perform the search
results = search_client.search(
search_text=search_text,
vector_queries=[vector_query],
select=non_vector_fields,
top=3
)
logging.info("Search performed successfully.")
except Exception as e:
logging.error(f"Error performing search: {str(e)}")
return func.HttpResponse(
f"Error performing search: {str(e)}",
status_code=500
)
try:
# Extract relevant data from results and put it into a list of dictionaries
response_data = [result for result in results]
response_data = json.dumps(response_data)
logging.info("Search results processed successfully.")
except Exception as e:
logging.error(f"Error processing search results: {str(e)}")
return func.HttpResponse(
f"Error processing search results: {str(e)}",
status_code=500
)
logging.info("Returning search results.")
return func.HttpResponse(response_data, mimetype="application/json")
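# Example request body for POST /api/vector_similarity_search (values are illustrative placeholders):
# {
#     "search_service_endpoint": "https://<your-search-service>.search.windows.net",
#     "index_name": "<your-index-name>",
#     "query": "What is a neural network?",
#     "k_nearest_neighbors": 3,
#     "search_column": "<your-vector-field>",
#     "use_hybrid_query": true
# }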

@ -0,0 +1,15 @@
{
"version": "2.0",
"logging": {
"applicationInsights": {
"samplingSettings": {
"isEnabled": true,
"excludedTypes": "Request"
}
}
},
"extensionBundle": {
"id": "Microsoft.Azure.Functions.ExtensionBundle",
"version": "[4.*, 5.0.0)"
}
}

@ -0,0 +1,17 @@
# Do not include azure-functions-worker in this file
# The Python Worker is managed by the Azure Functions platform
# Manually managing azure-functions-worker may cause unexpected issues
azure-functions
azure-search-documents
azure-identity
openai
azure-mgmt-search
pandas
azure-mgmt-resource
azure-mgmt-storage
azure-mgmt-web
python-dotenv
pyperclip
PyPDF2
tiktoken

@ -0,0 +1,19 @@
{
"scriptFile": "__init__.py",
"bindings": [
{
"authLevel": "Anonymous",
"type": "httpTrigger",
"direction": "in",
"name": "req",
"methods": [
"post"
]
},
{
"type": "http",
"direction": "out",
"name": "$return"
}
]
}

@ -0,0 +1,34 @@
# Action authentication
Actions offer different authentication schemas to accommodate various use cases. To specify the authentication schema for your action, use the GPT editor and select "None", "API Key", or "OAuth".
By default, the authentication method for all actions is set to "None", but you can change this and allow different actions to have different authentication methods.
## No authentication
We support flows without authentication for applications where users can send requests directly to your API without needing an API key or signing in with OAuth.
Consider using no authentication for initial user interactions as you might experience a user drop off if they are forced to sign into an application. You can create a "signed out" experience and then move users to a "signed in" experience by enabling a separate action.
## API key authentication
Just like how a user might already be using your API, we allow API key authentication through the GPT editor UI. We encrypt the secret key when we store it in our database to keep your API key secure.
This approach is useful if you have an API that takes slightly more consequential actions than the no authentication flow but does not require an individual user to sign in. Adding API key authentication can protect your API and give you more fine-grained access controls along with visibility into where requests are coming from.
## OAuth
Actions allow OAuth sign in for each user. This is the best way to provide personalized experiences and make the most powerful actions available to users. A simple example of the OAuth flow with actions will look like the following:
- To start, select "Authentication" in the GPT editor UI, and select "OAuth".
- You will be prompted to enter the OAuth client ID, client secret, authorization URL, token URL, and scope.
- The client ID and secret can be simple text strings but should [follow OAuth best practices](https://www.oauth.com/oauth2-servers/client-registration/client-id-secret/).
- We store an encrypted version of the client secret, while the client ID is available to end users.
- OAuth requests will include the following information: `request={'grant_type': 'authorization_code', 'client_id': 'YOUR_CLIENT_ID', 'client_secret': 'YOUR_CLIENT_SECRET', 'code': 'abc123', 'redirect_uri': 'https://chatgpt.com/aip/g-some_gpt_id/oauth/callback'}`
- In order for someone to use an action with OAuth, they will need to send a message that invokes the action; the user will then be presented with a "Sign in to [domain]" button in the ChatGPT UI.
- The `authorization_url` endpoint should return a response that looks like:
`{ "access_token": "example_token", "token_type": "bearer", "refresh_token": "example_token", "expires_in": 59 }`
- During the user sign in process, ChatGPT makes a request to your `authorization_url` using the specified `authorization_content_type`; we expect to get back an access token and optionally a [refresh token](https://auth0.com/learn/refresh-tokens), which we use to periodically fetch a new access token. (A minimal sketch of a token endpoint that returns this shape follows this list.)
- Each time a user makes a request to the action, the user's token will be passed in the Authorization header: ("Authorization": "[Bearer/Basic] [user's token]").
- We require that OAuth applications make use of the [state parameter](https://auth0.com/docs/secure/attack-protection/state-parameters#set-and-compare-state-parameter-values) for security reasons.
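For illustration only, here is a minimal sketch of a token endpoint that returns the token response shape shown above. The Flask framework, the route path, and the token issuance logic are assumptions for the example, not part of the ChatGPT OAuth requirements:
```python
from flask import Flask, request, jsonify
import secrets

app = Flask(__name__)

@app.route("/oauth/token", methods=["POST"])
def token_endpoint():
    # ChatGPT sends grant_type, client_id, client_secret, code, and redirect_uri in the request.
    params = request.form or request.get_json(silent=True) or {}
    if params.get("grant_type") not in ("authorization_code", "refresh_token"):
        return jsonify({"error": "unsupported_grant_type"}), 400
    # In a real server you would validate client_id/client_secret and the authorization code here.
    return jsonify({
        "access_token": secrets.token_urlsafe(32),
        "token_type": "bearer",
        "refresh_token": secrets.token_urlsafe(32),
        "expires_in": 3600,
    })
```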

@ -0,0 +1,333 @@
# Batch API
Learn how to use OpenAI's Batch API to send asynchronous groups of requests with 50% lower costs, a separate pool of significantly higher rate limits, and a clear 24-hour turnaround time. The service is ideal for processing jobs that don't require immediate responses. You can also [explore the API reference directly here](/docs/api-reference/batch).
## Overview
While some uses of the OpenAI Platform require you to send synchronous requests, there are many cases where requests do not need an immediate response or [rate limits](/docs/guides/rate-limits) prevent you from executing a large number of queries quickly. Batch processing jobs are often helpful in use cases like:
1. running evaluations
2. classifying large datasets
3. embedding content repositories
The Batch API offers a straightforward set of endpoints that allow you to collect a set of requests into a single file, kick off a batch processing job to execute these requests, query for the status of that batch while the underlying requests execute, and eventually retrieve the collected results when the batch is complete.
Compared to using standard endpoints directly, Batch API has:
1. **Better cost efficiency:** 50% cost discount compared to synchronous APIs
2. **Higher rate limits:** [Substantially more headroom](/settings/organization/limits) compared to the synchronous APIs
3. **Fast completion times:** Each batch completes within 24 hours (and often more quickly)
## Getting Started
### 1. Preparing Your Batch File
Batches start with a `.jsonl` file where each line contains the details of an individual request to the API. For now, the available endpoints are `/v1/chat/completions` ([Chat Completions API](/docs/api-reference/chat)) and `/v1/embeddings` ([Embeddings API](/docs/api-reference/embeddings)). For a given input file, the parameters in each line's `body` field are the same as the parameters for the underlying endpoint. Each request must include a unique `custom_id` value, which you can use to reference results after completion. Here's an example of an input file with 2 requests. Note that each input file can only include requests to a single model.
```jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-3.5-turbo-0125", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
```
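If you are generating many requests programmatically, a small script can write the input file for you. The sketch below is one way to do it, assuming a list of prompts and the same model as the example above:
```python
import json

# Build a Batch API input file from a list of prompts (prompts and file name are illustrative).
prompts = ["Hello world!", "Summarize the benefits of batch processing."]

with open("batchinput.jsonl", "w") as f:
    for i, prompt in enumerate(prompts, start=1):
        line = {
            "custom_id": f"request-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-3.5-turbo-0125",
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                "max_tokens": 1000,
            },
        }
        f.write(json.dumps(line) + "\n")
```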
### 2. Uploading Your Batch Input File
Similar to our [Fine-tuning API](/docs/guides/fine-tuning/), you must first upload your input file so that you can reference it correctly when kicking off batches. Upload your `.jsonl` file using the [Files API](/docs/api-reference/files).
<CodeSample
title="Upload files for Batch API"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
batch_input_file = client.files.create(
file=open("batchinput.jsonl", "rb"),
purpose="batch"
)
`.trim(),
curl: `
curl https://api.openai.com/v1/files \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-F purpose="batch" \\
-F file="@batchinput.jsonl"
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const file = await openai.files.create({
file: fs.createReadStream("batchinput.jsonl"),
purpose: "batch",
});\n
console.log(file);
}\n
main();
`.trim(),
}}
/>
### 3. Creating the Batch
Once you've successfully uploaded your input file, you can use the input File object's ID to create a batch. In this case, let's assume the file ID is `file-abc123`. For now, the completion window can only be set to `24h`. You can also provide custom metadata via an optional `metadata` parameter.
<CodeSample
title="Create the Batch"
defaultLanguage="python"
code={{
python: `
batch_input_file_id = batch_input_file.id\n
client.batches.create(
input_file_id=batch_input_file_id,
endpoint="/v1/chat/completions",
completion_window="24h",
metadata={
"description": "nightly eval job"
}
)
`.trim(),
curl: `
curl https://api.openai.com/v1/batches \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-d '{
"input_file_id": "file-abc123",
"endpoint": "/v1/chat/completions",
"completion_window": "24h"
}'
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const batch = await openai.batches.create({
input_file_id: "file-abc123",
endpoint: "/v1/chat/completions",
completion_window: "24h"
});\n
console.log(batch);
}\n
main();
`.trim(),
}}
/>
This request will return a [Batch object](/docs/api-reference/batch/object) with metadata about your batch:
```python
{
"id": "batch_abc123",
"object": "batch",
"endpoint": "/v1/chat/completions",
"errors": null,
"input_file_id": "file-abc123",
"completion_window": "24h",
"status": "validating",
"output_file_id": null,
"error_file_id": null,
"created_at": 1714508499,
"in_progress_at": null,
"expires_at": 1714536634,
"completed_at": null,
"failed_at": null,
"expired_at": null,
"request_counts": {
"total": 0,
"completed": 0,
"failed": 0
},
"metadata": null
}
```
### 4. Checking the Status of a Batch
You can check the status of a batch at any time, which will also return a Batch object.
<CodeSample
title="Check the status of a batch"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
client.batches.retrieve("batch_abc123")
`.trim(),
curl: `
curl https://api.openai.com/v1/batches/batch_abc123 \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const batch = await openai.batches.retrieve("batch_abc123");\n
console.log(batch);
}\n
main();
`.trim(),
}}
/>
The status of a given Batch object can be any of the following:
| Status | Description |
| ------------- | ------------------------------------------------------------------------------ |
| `validating` | the input file is being validated before the batch can begin |
| `failed` | the input file has failed the validation process |
| `in_progress` | the input file was successfully validated and the batch is currently being run |
| `finalizing` | the batch has completed and the results are being prepared |
| `completed` | the batch has been completed and the results are ready |
| `expired` | the batch was not able to be completed within the 24-hour time window |
| `cancelling` | the batch is being cancelled (may take up to 10 minutes) |
| `cancelled` | the batch was cancelled |
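Since batches run asynchronously, a common pattern is to poll until the batch reaches a terminal state. Below is a minimal polling sketch using the SDK calls shown above; the batch ID and sleep interval are illustrative:
```python
import time
from openai import OpenAI

client = OpenAI()

# States from the table above after which the batch will not change further.
terminal_states = {"completed", "failed", "expired", "cancelled"}

batch_id = "batch_abc123"  # illustrative ID
while True:
    batch = client.batches.retrieve(batch_id)
    print(f"Batch {batch_id} status: {batch.status}")
    if batch.status in terminal_states:
        break
    time.sleep(60)  # poll once a minute
```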
### 5. Retrieving the Results
Once the batch is complete, you can download the output by making a request against the [Files API](/docs/api-reference/files) via the `output_file_id` field from the Batch object and writing it to a file on your machine, in this case `batch_output.jsonl`:
<CodeSample
title="Retrieving the batch results"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
content = client.files.content("file-xyz123")
`.trim(),
curl: `
curl https://api.openai.com/v1/files/file-xyz123/content \\
-H "Authorization: Bearer $OPENAI_API_KEY" > batch_output.jsonl
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const file = await openai.files.content("file-xyz123");\n
console.log(file);
}\n
main();
`.trim(),
}}
/>
The output `.jsonl` file will have one response line for every successful request line in the input file. Any failed requests in the batch will have their error information written to an error file that can be found via the batch's `error_file_id`.
Note that the output line order may not match the input line order. Instead of relying on order to process your results, use the `custom_id` field, which is present in each line of your output file and allows you to map requests in your input to results in your output.
```jsonl
{"id": "batch_req_123", "custom_id": "request-2", "response": {"status_code": 200, "request_id": "req_123", "body": {"id": "chatcmpl-123", "object": "chat.completion", "created": 1711652795, "model": "gpt-3.5-turbo-0125", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello."}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 22, "completion_tokens": 2, "total_tokens": 24}, "system_fingerprint": "fp_123"}}, "error": null}
{"id": "batch_req_456", "custom_id": "request-1", "response": {"status_code": 200, "request_id": "req_789", "body": {"id": "chatcmpl-abc", "object": "chat.completion", "created": 1711652789, "model": "gpt-3.5-turbo-0125", "choices": [{"index": 0, "message": {"role": "assistant", "content": "Hello! How can I assist you today?"}, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 20, "completion_tokens": 9, "total_tokens": 29}, "system_fingerprint": "fp_3ba"}}, "error": null}
```
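For example, one way to map results back to your original requests is to index the output lines by `custom_id`, as in the sketch below (the file name and `custom_id` values match the examples above):
```python
import json

# Index each output line by its custom_id so results can be matched to inputs.
results_by_custom_id = {}
with open("batch_output.jsonl") as f:
    for line in f:
        record = json.loads(line)
        results_by_custom_id[record["custom_id"]] = record["response"]["body"]

# Look up the completion for the first request from the input file.
answer = results_by_custom_id["request-1"]["choices"][0]["message"]["content"]
print(answer)
```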
### 6. Cancelling a Batch
If necessary, you can cancel an ongoing batch. The batch's status will change to `cancelling` until in-flight requests are complete (up to 10 minutes), after which the status will change to `cancelled`.
<CodeSample
title="Cancelling a batch"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
client.batches.cancel("batch_abc123")
`.trim(),
curl: `
curl https://api.openai.com/v1/batches/batch_abc123/cancel \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-X POST
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const batch = await openai.batches.cancel("batch_abc123");\n
console.log(batch);
}\n
main();
`.trim(),
}}
/>
### 7. Getting a List of All Batches
At any time, you can see all your batches. For users with many batches, you can use the `limit` and `after` parameters to paginate your results.
<CodeSample
title="Getting a list of all batches"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
client.batches.list(limit=10)
`.trim(),
curl: `
curl https://api.openai.com/v1/batches?limit=10 \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json"
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const list = await openai.batches.list();\n
for await (const batch of list) {
console.log(batch);
}
}\n
main();
`.trim(),
}}
/>
## Model Availability
The Batch API can currently be used to execute queries against the following models. The Batch API supports text and vision inputs in the same format as the endpoints for these models:
- `gpt-4o`
- `gpt-4-turbo`
- `gpt-4`
- `gpt-4-32k`
- `gpt-3.5-turbo`
- `gpt-3.5-turbo-16k`
- `gpt-4-turbo-preview`
- `gpt-4-vision-preview`
- `gpt-4-turbo-2024-04-09`
- `gpt-4-0314`
- `gpt-4-32k-0314`
- `gpt-4-32k-0613`
- `gpt-3.5-turbo-0301`
- `gpt-3.5-turbo-16k-0613`
- `gpt-3.5-turbo-1106`
- `gpt-3.5-turbo-0613`
- `text-embedding-3-large`
- `text-embedding-3-small`
- `text-embedding-ada-002`
The Batch API also supports [fine-tuned models](/docs/guides/fine-tuning/what-models-can-be-fine-tuned).
## Rate Limits
Batch API rate limits are separate from existing per-model rate limits. The Batch API has two new types of rate limits:
1. **Per-batch limits:** A single batch may include up to 50,000 requests, and a batch input file can be up to 100 MB in size. Note that `/v1/embeddings` batches are also restricted to a maximum of 50,000 embedding inputs across all requests in the batch. (A quick client-side check against these limits is sketched below.)
2. **Enqueued prompt tokens per model:** Each model has a maximum number of enqueued prompt tokens allowed for batch processing. You can find these limits on the [Platform Settings page](/settings/organization/limits).
There are no limits for output tokens or number of submitted requests for the Batch API today. Because Batch API rate limits are a new, separate pool, **using the Batch API will not consume tokens from your standard per-model rate limits**, thereby offering you a convenient way to increase the number of requests and processed tokens you can use when querying our API.
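As a quick client-side check against the per-batch limits above, you can count the lines and measure the size of your input file before uploading it; this sketch assumes the `batchinput.jsonl` file from the earlier steps:
```python
import os

MAX_REQUESTS = 50_000              # per-batch request limit
MAX_BYTES = 100 * 1024 * 1024      # 100 MB input file limit

path = "batchinput.jsonl"
with open(path) as f:
    num_requests = sum(1 for _ in f)
size_bytes = os.path.getsize(path)

print(f"{num_requests} requests, {size_bytes / 1e6:.1f} MB")
assert num_requests <= MAX_REQUESTS, "Too many requests for a single batch"
assert size_bytes <= MAX_BYTES, "Input file exceeds the 100 MB limit"
```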
## Batch Expiration
Batches that do not complete in time eventually move to an `expired` state; unfinished requests within that batch are cancelled, and any responses to completed requests are made available via the batch's output file. You will be charged for tokens consumed from any completed requests.
## Other Resources
For more concrete examples, visit **[the OpenAI Cookbook](https://cookbook.openai.com/examples/batch_processing)**, which contains sample code for use cases like classification, sentiment analysis, and summary generation.

@ -0,0 +1,393 @@
# Changelog
Keep track of changes to the OpenAI API. You can also track changes via our [public OpenAPI specification](https://github.com/openai/openai-openapi) which is used to generate our SDKs, documentation, and more. This changelog is maintained in a best effort fashion and may not reflect all changes
being made.
### Jun 6th, 2024
- [Parallel function calling](/docs/guides/function-calling/parallel-function-calling) can be disabled in Chat Completions and the Assistants API by passing `parallel_tool_calls=false`.
- .NET SDK launched in Beta.

### Jun 3rd, 2024
- Added support for [file search customizations](/docs/assistants/tools/file-search/customizing-file-search-settings).

### May 15th, 2024
- Added support for archiving projects. Only organization owners can access this functionality.
- Added support for setting cost limits on a per-project basis for pay as you go customers.

### May 13th, 2024
- Released GPT-4o in the API. GPT-4o is our fastest and most affordable flagship model.

### May 9th, 2024
- Added support for [image inputs to the Assistants API](/docs/assistants/how-it-works/creating-image-input-content).

### May 7th, 2024
- Added support for fine-tuned models to the Batch API.

### May 6th, 2024
- Added [`stream_options: {"include_usage": true}`](/docs/api-reference/chat/create#chat-create-stream_options) parameter to the Chat Completions and Completions APIs. Setting this gives developers access to usage stats when using streaming.

### May 2nd, 2024
- Added a new endpoint to delete a message from a thread in the Assistants API.

### Apr 29th, 2024
- Added a new [function calling option `tool_choice: "required"`](/docs/guides/function-calling/function-calling-behavior) to the Chat Completions and Assistants APIs.
- Added a guide for the Batch API and Batch API support for embeddings models.

### Apr 17th, 2024
- Introduced a series of updates to the Assistants API, including a new file search tool allowing up to 10,000 files per assistant, new token controls, and support for tool choice.

### Apr 16th, 2024
- Introduced project based hierarchy for organizing work by projects, including the ability to create API keys and manage rate and cost limits on a per-project basis (cost limits available only for Enterprise customers).

### Apr 15th, 2024
- Released Batch API.

### Apr 9th, 2024
- Released GPT-4 Turbo with Vision in general availability in the API.

### Apr 4th, 2024
- Added support for seed in the fine-tuning API.
- Added support for checkpoints in the fine-tuning API.
- Added support for [adding Messages when creating a Run](/docs/api-reference/runs/createRun#runs-createrun-additional_messages) in the Assistants API.

### Apr 1st, 2024
- Added support for [filtering Messages by run_id](/docs/api-reference/messages/listMessages#messages-listmessages-run_id) in the Assistants API.

### Mar 29th, 2024
- Added support for [temperature](/docs/api-reference/runs/createRun#runs-createrun-temperature) and [assistant message creation](/docs/api-reference/messages/createMessage#messages-createmessage-role) in the Assistants API.

### Mar 14th, 2024
- Added support for streaming in the Assistants API.

### Feb 9th, 2024
- Added `timestamp_granularities` parameter to the Audio API.

### Feb 1st, 2024
- Released gpt-3.5-turbo-0125, an updated GPT-3.5 Turbo model.

### Jan 25th, 2024
- Released embedding V3 models and an updated GPT-4 Turbo preview.
- Added [`dimensions` parameter](/docs/api-reference/embeddings/create#embeddings-create-dimensions) to the Embeddings API.

### Dec 20th, 2023
- Added [`additional_instructions` parameter](/docs/api-reference/runs/createRun#runs-createrun-additional_instructions) to run creation in the Assistants API.

### Dec 15th, 2023
- Added `logprobs` and `top_logprobs` parameters to the Chat Completions API.

### Dec 14th, 2023
- Changed the function `parameters` argument on a tool call to be optional.

### Nov 30th, 2023
- Released OpenAI Deno SDK.

### Nov 6th, 2023
- Released GPT-4 Turbo Preview, updated GPT-3.5 Turbo, GPT-4 Turbo with Vision, Assistants API, DALL·E 3 in the API, and text-to-speech API.
- Deprecated the Chat Completions `functions` parameter in favor of `tools`.
- Released OpenAI Python SDK V1.0.

### Oct 16th, 2023
- Added [`encoding_format` parameter](/docs/api-reference/embeddings/create#embeddings-create-encoding_format) to the Embeddings API.
- Added max_tokens to the Moderation models.

### Oct 6th, 2023
- Added function calling support to the Fine-tuning API.

@ -0,0 +1,560 @@
# How to build an AI that can answer questions about your website
This tutorial walks through a simple example of crawling a website (in this example, the OpenAI website), turning the crawled pages into embeddings using the [Embeddings API](/docs/guides/embeddings), and then creating a basic search functionality that allows a user to ask questions about the embedded information. This is intended to be a starting point for more sophisticated applications that make use of custom knowledge bases.
# Getting started
Some basic knowledge of Python and GitHub is helpful for this tutorial. Before diving in, make sure to [set up an OpenAI API key](/docs/api-reference/introduction) and walk through the [quickstart tutorial](/docs/quickstart). This will give a good intuition on how to use the API to its full potential.
Python is used as the main programming language along with the OpenAI, Pandas, transformers, NumPy, and other popular packages. If you run into any issues working through this tutorial, please ask a question on the [OpenAI Community Forum](https://community.openai.com).
To start with the code, clone the [full code for this tutorial on GitHub](https://github.com/openai/web-crawl-q-and-a-example). Alternatively, follow along and copy each section into a Jupyter notebook and run the code step by step, or just read along. A good way to avoid any issues is to set up a new virtual environment and install the required packages by running the following commands:
```bash
python -m venv env
source env/bin/activate
pip install -r requirements.txt
```
## Setting up a web crawler
The primary focus of this tutorial is the OpenAI API so if you prefer, you can skip the context on how to create a web crawler and just [download the source code](https://github.com/openai/web-crawl-q-and-a-example). Otherwise, expand the section below to work through the scraping mechanism implementation.
<Image
png="https://cdn.openai.com/API/docs/images/tutorials/web-qa/DALL-E-coding-a-web-crawling-system-pixel-art.png"
webp="https://cdn.openai.com/API/docs/images/tutorials/web-qa/DALL-E-coding-a-web-crawling-system-pixel-art.webp"
alt="DALL-E: Coding a web crawling system pixel art"
width="1024"
height="1024"
/>
Acquiring data in text form is the first step to use embeddings. This tutorial
creates a new set of data by crawling the OpenAI website, a technique that you
can also use for your own company or personal website.
<Button
size="small"
color={ButtonColor.neutral}
href="https://github.com/openai/web-crawl-q-and-a-example"
target="_blank"
>
View source code
</Button>
While this crawler is written from scratch, open source packages like [Scrapy](https://github.com/scrapy/scrapy) can also help with these operations.
This crawler will start from the root URL passed in at the bottom of the code below, visit each page, find additional links, and visit those pages as well (as long as they have the same root domain). To begin, import the required packages, set up the basic URL, and define a HTMLParser class.
```python
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from collections import deque
from html.parser import HTMLParser
from urllib.parse import urlparse
import os
# Regex pattern to match a URL
HTTP_URL_PATTERN = r'^http[s]*://.+'
domain = "openai.com" # <- put your domain to be crawled
full_url = "https://openai.com/" # <- put your domain to be crawled with https or http
# Create a class to parse the HTML and get the hyperlinks
class HyperlinkParser(HTMLParser):
def __init__(self):
super().__init__()
# Create a list to store the hyperlinks
self.hyperlinks = []
# Override the HTMLParser's handle_starttag method to get the hyperlinks
def handle_starttag(self, tag, attrs):
attrs = dict(attrs)
# If the tag is an anchor tag and it has an href attribute, add the href attribute to the list of hyperlinks
if tag == "a" and "href" in attrs:
self.hyperlinks.append(attrs["href"])
```
The next function takes a URL as an argument, opens the URL, and reads the HTML content. Then, it returns all the hyperlinks found on that page.
```python
# Function to get the hyperlinks from a URL
def get_hyperlinks(url):
# Try to open the URL and read the HTML
try:
# Open the URL and read the HTML
with urllib.request.urlopen(url) as response:
# If the response is not HTML, return an empty list
if not response.info().get('Content-Type').startswith("text/html"):
return []
# Decode the HTML
html = response.read().decode('utf-8')
except Exception as e:
print(e)
return []
# Create the HTML Parser and then Parse the HTML to get hyperlinks
parser = HyperlinkParser()
parser.feed(html)
return parser.hyperlinks
```
The goal is to crawl through and index only the content that lives under the OpenAI domain. For this purpose, a function that calls the `get_hyperlinks` function but filters out any URLs that are not part of the specified domain is needed.
```python
# Function to get the hyperlinks from a URL that are within the same domain
def get_domain_hyperlinks(local_domain, url):
clean_links = []
for link in set(get_hyperlinks(url)):
clean_link = None
# If the link is a URL, check if it is within the same domain
if re.search(HTTP_URL_PATTERN, link):
# Parse the URL and check if the domain is the same
url_obj = urlparse(link)
if url_obj.netloc == local_domain:
clean_link = link
# If the link is not a URL, check if it is a relative link
else:
if link.startswith("/"):
link = link[1:]
elif link.startswith("#") or link.startswith("mailto:"):
continue
clean_link = "https://" + local_domain + "/" + link
if clean_link is not None:
if clean_link.endswith("/"):
clean_link = clean_link[:-1]
clean_links.append(clean_link)
# Return the list of hyperlinks that are within the same domain
return list(set(clean_links))
```
The `crawl` function is the final step in the web scraping task setup. It keeps track of the visited URLs to avoid repeating the same page, which might be linked across multiple pages on a site. It also extracts the raw text from a page without the HTML tags, and writes the text content into a local .txt file specific to the page.
```python
def crawl(url):
# Parse the URL and get the domain
local_domain = urlparse(url).netloc
# Create a queue to store the URLs to crawl
queue = deque([url])
# Create a set to store the URLs that have already been seen (no duplicates)
seen = set([url])
# Create a directory to store the text files
if not os.path.exists("text/"):
os.mkdir("text/")
if not os.path.exists("text/"+local_domain+"/"):
os.mkdir("text/" + local_domain + "/")
# Create a directory to store the csv files
if not os.path.exists("processed"):
os.mkdir("processed")
# While the queue is not empty, continue crawling
while queue:
# Get the next URL from the queue
url = queue.pop()
print(url) # for debugging and to see the progress
# Save text from the url to a .txt file
with open('text/'+local_domain+'/'+url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
# Get the text from the URL using BeautifulSoup
soup = BeautifulSoup(requests.get(url).text, "html.parser")
# Get the text but remove the tags
text = soup.get_text()
# If the crawler gets to a page that requires JavaScript, it will stop the crawl
if ("You need to enable JavaScript to run this app." in text):
print("Unable to parse page " + url + " due to JavaScript being required")
# Otherwise, write the text to the file in the text directory
f.write(text)
# Get the hyperlinks from the URL and add them to the queue
for link in get_domain_hyperlinks(local_domain, url):
if link not in seen:
queue.append(link)
seen.add(link)
crawl(full_url)
```
The last line of the above example runs the crawler which goes through all the accessible links and turns those pages into text files. This will take a few minutes to run depending on the size and complexity of your site.
## Building an embeddings index
<Image
png="https://cdn.openai.com/API/docs/images/tutorials/web-qa/DALL-E-woman-turning-a-stack-of-papers-into-numbers-pixel-art.png"
webp="https://cdn.openai.com/API/docs/images/tutorials/web-qa/DALL-E-woman-turning-a-stack-of-papers-into-numbers-pixel-art.webp"
alt="DALL-E: Woman turning a stack of papers into numbers pixel art"
width="1024"
height="1024"
/>
CSV is a common format for storing embeddings. You can use this format with
Python by converting the raw text files (which are in the text directory) into
Pandas data frames. Pandas is a popular open source library that helps you
work with tabular data (data stored in rows and columns).
Blank lines can clutter the text files and make them harder to process. A simple
function can remove those lines and tidy up the files.
```python
def remove_newlines(serie):
serie = serie.str.replace('\n', ' ')
serie = serie.str.replace('\\n', ' ')
serie = serie.str.replace(' ', ' ')
serie = serie.str.replace(' ', ' ')
return serie
```
Converting the text to CSV requires looping through the text files in the text directory created earlier. After opening each file, remove the extra spacing and append the modified text to a list. Then, add the text with the new lines removed to an empty Pandas data frame and write the data frame to a CSV file.
Extra spacing and new lines can clutter the text and complicate the embeddings
process. The code used here helps to remove some of them but you may find 3rd party
libraries or other methods useful to get rid of more unnecessary characters.
```python
import pandas as pd
# Create a list to store the text files
texts=[]
# Get all the text files in the text directory
for file in os.listdir("text/" + domain + "/"):
# Open the file and read the text
with open("text/" + domain + "/" + file, "r", encoding="UTF-8") as f:
text = f.read()
# Omit the first 11 lines and the last 4 lines, then replace -, _, and #update with spaces.
texts.append((file[11:-4].replace('-',' ').replace('_', ' ').replace('#update',''), text))
# Create a dataframe from the list of texts
df = pd.DataFrame(texts, columns = ['fname', 'text'])
# Set the text column to be the raw text with the newlines removed
df['text'] = df.fname + ". " + remove_newlines(df.text)
df.to_csv('processed/scraped.csv')
df.head()
```
Tokenization is the next step after saving the raw text into a CSV file. This process splits the input text into tokens by breaking down the sentences and words. A visual demonstration of this can be seen by [checking out our Tokenizer](/tokenizer) in the docs.
> A helpful rule of thumb is that one token generally corresponds to ~4 characters of text for common English text. This translates to roughly ¾ of a word (so 100 tokens ~= 75 words).
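You can check this rule of thumb yourself with tiktoken (used later in this tutorial); the sample sentence below is illustrative:
```python
import tiktoken

# Compare character count to token count for a sample sentence.
encoding = tiktoken.get_encoding("cl100k_base")
sample = "A helpful rule of thumb is that one token is roughly four characters of English text."
tokens = encoding.encode(sample)
print(len(sample), "characters ->", len(tokens), "tokens")
```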
The API has a limit on the maximum number of input tokens for embeddings. To stay below the limit, the text in the CSV file needs to be broken down into multiple rows. The existing length of each row will be recorded first to identify which rows need to be split.
```python
import tiktoken
# Load the cl100k_base tokenizer which is designed to work with the ada-002 model
tokenizer = tiktoken.get_encoding("cl100k_base")
df = pd.read_csv('processed/scraped.csv', index_col=0)
df.columns = ['title', 'text']
# Tokenize the text and save the number of tokens to a new column
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
# Visualize the distribution of the number of tokens per row using a histogram
df.n_tokens.hist()
```
<img
src="https://cdn.openai.com/API/docs/images/tutorials/web-qa/embeddings-initial-histrogram.png"
alt="Embeddings histogram"
width="553"
height="413"
/>
The newest embeddings model can handle inputs with up to 8191 input tokens, so most of the rows will not need any chunking. This may not be the case for every subpage scraped, however, so the next code block splits longer texts into smaller chunks.
```Python
max_tokens = 500
# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens):
# Split the text into sentences
sentences = text.split('. ')
# Get the number of tokens for each sentence
n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
chunks = []
tokens_so_far = 0
chunk = []
# Loop through the sentences and tokens joined together in a tuple
for sentence, token in zip(sentences, n_tokens):
# If the number of tokens so far plus the number of tokens in the current sentence is greater
# than the max number of tokens, then add the chunk to the list of chunks and reset
# the chunk and tokens so far
if tokens_so_far + token > max_tokens:
chunks.append(". ".join(chunk) + ".")
chunk = []
tokens_so_far = 0
# If the number of tokens in the current sentence is greater than the max number of
# tokens, go to the next sentence
if token > max_tokens:
continue
# Otherwise, add the sentence to the chunk and add the number of tokens to the total
chunk.append(sentence)
tokens_so_far += token + 1
return chunks
shortened = []
# Loop through the dataframe
for row in df.iterrows():
# If the text is None, go to the next row
if row[1]['text'] is None:
continue
# If the number of tokens is greater than the max number of tokens, split the text into chunks
if row[1]['n_tokens'] > max_tokens:
shortened += split_into_many(row[1]['text'])
# Otherwise, add the text to the list of shortened texts
else:
shortened.append( row[1]['text'] )
```
Visualizing the updated histogram again can help to confirm if the rows were successfully split into shortened sections.
```python
df = pd.DataFrame(shortened, columns = ['text'])
df['n_tokens'] = df.text.apply(lambda x: len(tokenizer.encode(x)))
df.n_tokens.hist()
```
<img
src="https://cdn.openai.com/API/docs/images/tutorials/web-qa/embeddings-tokenized-output.png"
alt="Embeddings tokenized output"
width="552"
height="418"
/>
The content is now broken down into smaller chunks and a simple request can be sent to the OpenAI API specifying the use of the new text-embedding-ada-002 model to create the embeddings:
```python
from openai import OpenAI
client = OpenAI(
api_key=os.environ.get("OPENAI_API_KEY"),
)
df['embeddings'] = df.text.apply(lambda x: client.embeddings.create(input=x, model='text-embedding-ada-002').data[0].embedding)
df.to_csv('processed/embeddings.csv')
df.head()
```
This should take about 3-5 minutes, after which your embeddings will be ready to use!
## Building a question answer system with your embeddings
<Image
png="https://cdn.openai.com/API/docs/images/tutorials/web-qa/DALL-E-friendly-robot-question-and-answer-system-pixel-art.png"
webp="https://cdn.openai.com/API/docs/images/tutorials/web-qa/DALL-E-friendly-robot-question-and-answer-system-pixel-art.webp"
alt="DALL-E: Friendly robot question and answer system pixel art"
width="1024"
height="1024"
/>
The embeddings are ready and the final step of this process is to create a
simple question and answer system. This will take a user's question, create an
embedding of it, and compare it with the existing embeddings to retrieve the
most relevant text from the scraped website. The gpt-3.5-turbo model will then
generate a natural sounding answer based on the retrieved text.
---
Turning the embeddings into a NumPy array is the first step, which will provide more flexibility in how to use it given the many functions available that operate on NumPy arrays. It will also flatten the dimension to 1-D, which is the required format for many subsequent operations.
```python
import numpy as np
# openai.embeddings_utils was removed in openai >= 1.0, so define the cosine-distance helper locally
def distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine"):
    query = np.array(query_embedding)
    return [1 - np.dot(query, e) / (np.linalg.norm(query) * np.linalg.norm(e)) for e in embeddings]

df=pd.read_csv('processed/embeddings.csv', index_col=0)
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)
df.head()
```
Now that the data is ready, the question needs to be converted to an embedding with a simple function. This is important because the search with embeddings compares vectors of numbers (the conversion of the raw text) using cosine distance. Vectors that are close in cosine distance are likely related and might contain the answer to the question. Older versions of the OpenAI Python package shipped a built-in `distances_from_embeddings` helper; the small cosine-distance helper defined above serves the same purpose here.
```python
def create_context(
question, df, max_len=1800, size="ada"
):
"""
Create a context for a question by finding the most similar context from the dataframe
"""
# Get the embeddings for the question
    q_embeddings = client.embeddings.create(input=question, model='text-embedding-ada-002').data[0].embedding
# Get the distances from the embeddings
df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')
returns = []
cur_len = 0
# Sort by distance and add the text to the context until the context is too long
for i, row in df.sort_values('distances', ascending=True).iterrows():
# Add the length of the text to the current length
cur_len += row['n_tokens'] + 4
# If the context is too long, break
if cur_len > max_len:
break
# Else add it to the text that is being returned
returns.append(row["text"])
# Return the context
return "\n\n###\n\n".join(returns)
```
The text was broken up into smaller sets of tokens, so looping through in ascending order and continuing to add the text is a critical step to ensure a full answer. The max_len can also be modified to something smaller, if more content than desired is returned.
The previous step only retrieved chunks of texts that are semantically related to the question, so they might contain the answer, but there's no guarantee of it. The chance of finding an answer can be further increased by returning the top 5 most likely results.
The answering prompt will then try to extract the relevant facts from the retrieved contexts in order to formulate a coherent answer. If there is no relevant answer, the prompt will return "I don't know".
A realistic sounding answer to the question can be created with the Chat Completions endpoint using `gpt-3.5-turbo`.
```python
def answer_question(
df,
model="gpt-3.5-turbo",
question="Am I allowed to publish model outputs to Twitter, without a human review?",
max_len=1800,
size="ada",
debug=False,
max_tokens=150,
stop_sequence=None
):
"""
Answer a question based on the most similar context from the dataframe texts
"""
context = create_context(
question,
df,
max_len=max_len,
size=size,
)
# If debug, print the raw model response
if debug:
print("Context:\n" + context)
print("\n\n")
try:
# Create a chat completion using the question and context
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\n"},
                {"role": "user", "content": f"Context: {context}\n\n---\n\nQuestion: {question}\nAnswer:"},
            ],
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
        )
        return response.choices[0].message.content.strip()
except Exception as e:
print(e)
return ""
```
That's it! A working Q&A system with the knowledge embedded from the OpenAI website is now ready. A few quick tests can be run to check the quality of the output:
```python
answer_question(df, question="What day is it?", debug=False)
answer_question(df, question="What is our newest embeddings model?")
answer_question(df, question="What is ChatGPT?")
```
The responses will look something like the following:
```response
"I don't know."
'The newest embeddings model is text-embedding-ada-002.'
'ChatGPT is a model trained to interact in a conversational way. It is able to answer followup questions, admit its mistakes, challenge incorrect premises, and reject inappropriate requests.'
```
If the system is not able to answer a question it is expected to know, search through the raw text files to see whether the relevant information actually ended up being embedded. The crawling process was set up to skip sites outside the original domain, so content hosted on a separate subdomain may not have been captured.
Currently, the dataframe is being passed in each time to answer a question. For more production workflows, a [vector database solution](/docs/guides/embeddings/how-can-i-retrieve-k-nearest-embedding-vectors-quickly) should be used instead of storing the embeddings in a CSV file, but the current approach is a great option for prototyping.

@ -0,0 +1,118 @@
Curl is a popular command line tool used by developers to send HTTP requests to APIs. It requires minimal setup time but is less capable than a fully featured programming language like Python or JavaScript.
## Step 1: Set up curl
Many operating systems ship curl by default. You can check if you have curl installed by opening your Terminal or Command line, then entering the command:
`curl https://platform.openai.com`
If curl is set up and you are connected to the internet, it will send an HTTP request to fetch the contents of "platform.openai.com".
If you get an error that curl is not found, you can install it by following the instructions on the [curl homepage](https://everything.curl.dev/get).
## Step 2: Set up your API key
Now that we have curl working, the next step is to set up an API key in your Terminal or Command line. You can optionally skip this step and just include your API key in your request, as described in Step 3.
**MacOS:**
1. **Open Terminal**: You can find it in the Applications folder or search for it using Spotlight (Command + Space).
2. **Edit bash profile**: Use the command `nano ~/.bash_profile` or `nano ~/.zshrc` (for newer MacOS versions) to open the profile file in a text editor.
3. **Add environment variable**: In the editor, add the line below, replacing `your-api-key-here` with your actual API key:
```
export OPENAI_API_KEY='your-api-key-here'
```
4. **Save and exit**: Press Ctrl+O to write the changes, followed by Ctrl+X to close the editor.
5. **Load your profile**: Use the command `source ~/.bash_profile` or `source ~/.zshrc` to load the updated profile.
6. **Verification**: Verify the setup by typing `echo $OPENAI_API_KEY` in the terminal. It should display your API key.
**Windows:**
1. **Open Command Prompt**: You can find it by searching "cmd" in the start menu.
2. **Set environment variable**: Use the command below, replacing `your-api-key-here` with your actual API key:
```
setx OPENAI_API_KEY "your-api-key-here"
```
This command will set the OPENAI_API_KEY environment variable for future command prompt sessions; open a new command prompt for it to take effect.
3. **Permanent setup**: To make the setup permanent, add the variable through the system properties as follows:
- Right-click on 'This PC' or 'My Computer' and select 'Properties'.
- Click on 'Advanced system settings'.
- Click the 'Environment Variables' button.
- In the 'System variables' section, click 'New...' and enter OPENAI_API_KEY as the variable name and your API key as the variable value.
4. **Verification**: To verify the setup, reopen the command prompt and type the command below. It should display your API key:
```
echo %OPENAI_API_KEY%
```
## Step 3: Sending your first API request
Once you have an API key set up, the final step is to send your first API request. To do so, sample requests to the [Chat Completions](/docs/api-reference/chat/create), [Embeddings](/docs/api-reference/embeddings/create), and [Images](/docs/api-reference/images/create) APIs are included below. Because the API key was set in Step 2, it should be automatically referenced via `$OPENAI_API_KEY` in your terminal or command line. You can also manually replace `$OPENAI_API_KEY` with your API key, but be sure to keep the curl command hidden if it includes your API key.
<CodeSample
defaultLanguage="ChatCompletions"
code={{
ChatCompletions: `
curl https://api.openai.com/v1/chat/completions \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."
},
{
"role": "user",
"content": "Compose a poem that explains the concept of recursion in programming."
}
]
}'
`.trim(),
Embeddings: `
curl https://api.openai.com/v1/embeddings \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-d '{
"input": "The food was delicious and the waiter...",
"model": "text-embedding-ada-002"
}'
`.trim(),
Images: `
curl https://api.openai.com/v1/images/generations \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{
"prompt": "A cute baby sea otter",
"n": 2,
"size": "1024x1024"
}'
`.trim(),
}}
/>
The [Chat Completions](/docs/api-reference/chat/create) example highlights just one area of strength for our models: creative ability. Explaining recursion (the programming topic) in a well formatted poem is something both the best developers and best poets would struggle with. In this case, `gpt-3.5-turbo` does it effortlessly.

@ -0,0 +1,75 @@
# Data retrieval with GPT Actions
One of the most common tasks an action in a GPT can perform is data retrieval. An action might:
1. Access an API to retrieve data based on a keyword search
2. Access a relational database to retrieve records based on a structured query
3. Access a vector database to retrieve text chunks based on semantic search
We'll explore considerations specific to the various types of retrieval integrations in this guide.
## Data retrieval using APIs
Many organizations rely on 3rd party software to store important data. Think Salesforce for customer data, Zendesk for support data, Confluence for internal process data, and Google Drive for business documents. These providers often provide REST APIs which enable external systems to search for and retrieve information.
When building an action to integrate with a provider's REST API, start by reviewing the existing documentation. You'll need to confirm a few things:
1. Retrieval methods
- **Search** - Each provider will support different search semantics, but generally you want a method which takes a keyword or query string and returns a list of matching documents. See [Google Drive's `file.list` method](https://developers.google.com/drive/api/guides/search-files) for an example.
- **Get** - Once you've found matching documents, you need a way to retrieve them. See [Google Drive's `file.get` method](https://developers.google.com/drive/api/reference/rest/v3/files/get) for an example.
2. Authentication scheme
- For example, [Google Drive uses OAuth](https://developers.google.com/workspace/guides/configure-oauth-consent) to authenticate users and ensure that only their available files are available for retrieval.
3. OpenAPI spec
- Some providers will provide an OpenAPI spec document which you can import directly into your action. See [Zendesk](https://developer.zendesk.com/api-reference/ticketing/introduction/#download-openapi-file) for an example.
- You may want to remove references to methods your GPT _won't_ access, which constrains the actions your GPT can perform.
- For providers who _don't_ provide an OpenAPI spec document, you can create your own using the [ActionsGPT](https://chatgpt.com/g/g-TYEliDU6A-actionsgpt) (a GPT developed by OpenAI).
Your goal is to get the GPT to use the action to search for and retrieve documents containing context that is relevant to the user's prompt. Your GPT follows your instructions to use the provided search and get methods to achieve this goal.
## Data retrieval using Relational Databases
Organizations use relational databases to store a variety of records pertaining to their business. These records can contain useful context that will help improve your GPT's responses. For example, let's say you are building a GPT to help users understand the status of an insurance claim. If the GPT can look up claims in a relational database based on a claim number, the GPT will be much more useful to the user.
When building an action to integrate with a relational database, there are a few things to keep in mind:
1. Availability of REST APIs
- Many relational databases do not natively expose a REST API for processing queries. In that case, you may need to build or buy middleware which can sit between your GPT and the database.
- This middleware should do the following:
- Accept a formal query string
- Pass the query string to the database
- Respond back to the requester with the returned records
2. Accessibility from the public internet
- Unlike APIs which are designed to be accessed from the public internet, relational databases are traditionally designed to be used within an organization's application infrastructure. Because GPTs are hosted on OpenAI's infrastructure, you'll need to make sure that any APIs you expose are accessible outside of your firewall.
3. Complex query strings
   - Relational databases use a formal query syntax like SQL to retrieve relevant records. This means you need to provide additional instructions to the GPT indicating which query syntax is supported. The good news is that GPTs are usually very good at generating formal queries based on user input.
4. Database permissions
   - Although databases support user-level permissions, it is likely that your end users won't have permission to access the database directly. If you opt to use a service account to provide access, consider giving the service account read-only permissions. This helps avoid inadvertently overwriting or deleting existing data.
Your goal is to get the GPT to write a formal query related to the user's prompt, submit the query via the action, and then use the returned records to augment the response.
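As a minimal sketch of the middleware described above, the endpoint below accepts a formal query string, runs it against a read-only connection, and returns the matching records as JSON (Flask, SQLite, and the guardrail are illustrative assumptions; substitute your own database driver and validation):
```python
import sqlite3

from flask import Flask, jsonify, request

app = Flask(__name__)
DB_PATH = "claims.db"  # assumption: a database reachable by a read-only service account

@app.post("/query")
def run_query():
    sql = request.get_json().get("query", "")
    # Guardrail: only allow read-only SELECT statements submitted by the GPT
    if not sql.lstrip().lower().startswith("select"):
        return jsonify({"error": "Only SELECT statements are allowed."}), 400
    conn = sqlite3.connect(f"file:{DB_PATH}?mode=ro", uri=True)
    conn.row_factory = sqlite3.Row
    try:
        rows = [dict(r) for r in conn.execute(sql).fetchall()]
    finally:
        conn.close()
    return jsonify({"records": rows})
```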
## Data retrieval using Vector Databases
If you want to equip your GPT with the most relevant search results, you might consider integrating your GPT with a vector database which supports semantic search as described above. There are many managed and self-hosted solutions available on the market; [see here for a partial list](https://github.com/openai/chatgpt-retrieval-plugin#choosing-a-vector-database).
When building an action to integrate with a vector database, there are a few things to keep in mind:
1. Availability of REST APIs
   - Many vector databases do not natively expose a REST API for processing queries. In that case, you may need to build or buy middleware which can sit between your GPT and the database (more on middleware below).
2. Accessibility from the public internet
   - Unlike APIs which are designed to be accessed from the public internet, vector databases are traditionally designed to be used within an organization's application infrastructure. Because GPTs are hosted on OpenAI's infrastructure, you'll need to make sure that any APIs you expose are accessible outside of your firewall.
3. Query embedding
- As discussed above, vector databases typically accept a vector embedding (as opposed to plain text) as query input. This means that you need to use an embedding API to convert the query input into a vector embedding before you can submit it to the vector database. This conversion is best handled in the REST API gateway, so that the GPT can submit a plaintext query string.
4. Database permissions
- Because vector databases store text chunks as opposed to full documents, it can be difficult to maintain user permissions which might have existed on the original source documents. Remember that any user who can access your GPT will have access to all of the text chunks in the database and plan accordingly.
### Middleware for vector databases
As described above, middleware for vector databases typically needs to do two things:
1. Expose access to the vector database via a REST API
2. Convert plaintext query strings into vector embeddings
![Middleware for vector databases](https://cdn.openai.com/API/docs/images/actions-db-diagram.webp)
The goal is to get your GPT to submit a relevant query to a vector database to trigger a semantic search, and then use the returned text chunks to augment the response.
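As a minimal sketch of that middleware, the endpoint below accepts a plaintext query, converts it into a vector embedding with the OpenAI embeddings API, and passes the embedding to a vector database for semantic search (Flask is an illustrative choice, and `vector_db_search` is a hypothetical helper you would replace with your database client's k-nearest-neighbor query):
```python
import os

from flask import Flask, jsonify, request
from openai import OpenAI

app = Flask(__name__)
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

def vector_db_search(embedding, top_k):
    """Hypothetical helper: replace with your vector database client's nearest-neighbor query."""
    raise NotImplementedError

@app.post("/vector_search")
def vector_search():
    body = request.get_json()
    # Convert the plaintext query into a vector embedding
    embedding = client.embeddings.create(
        input=body.get("query", ""),
        model="text-embedding-3-small",
    ).data[0].embedding
    # Trigger a semantic search and return the matching text chunks to the GPT
    chunks = vector_db_search(embedding, top_k=body.get("k", 5))
    return jsonify({"chunks": chunks})
```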

@ -0,0 +1,162 @@
# Deprecations
## Overview
As we launch safer and more capable models, we regularly retire older models. Software relying on OpenAI models may need occasional updates to keep working. Impacted customers will always be notified by email and in our documentation along with [blog posts](https://openai.com/blog) for larger changes.
This page lists all API deprecations, along with recommended replacements.
## Deprecation vs Legacy
We use the term "deprecation" to refer to the process of retiring a model or endpoint. When we announce that a model or endpoint is being deprecated, it immediately becomes deprecated. All deprecated models and endpoints will also have a shut down date. At the time of the shut down, the model or endpoint will no longer be accessible.
We use the term "legacy" to refer to models and endpoints that will no longer receive updates. We tag endpoints and models as legacy to signal to developers where we are moving as a platform and that they should likely migrate to newer models or endpoints. You can expect that a legacy model or endpoint will be deprecated at some point in the future.
## Incremental model updates
As [announced](https://openai.com/blog/introducing-chatgpt-and-whisper-apis#:~:text=Chat%20guide.-,ChatGPT%20upgrades,-We%20are%20constantly) in March 2023, we regularly release new versions of `gpt-4` and `gpt-3.5-turbo`.
Each model version is dated with either an `-MMDD` or a `-YYYY-MM-DD` suffix; e.g., `gpt-4-0613` or `gpt-4o-2024-05-13`. The undated model name, e.g., `gpt-4o`, will typically point to the latest dated version. Users of undated model names will be notified by email, typically two weeks before any change takes place.
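If your application is sensitive to these incremental updates, you can pin a dated snapshot instead of the undated alias; a minimal sketch with the Python SDK:
```python
from openai import OpenAI

client = OpenAI()

# Pinning the dated snapshot avoids silently picking up future updates to "gpt-4o"
response = client.chat.completions.create(
    model="gpt-4o-2024-05-13",
    messages=[{"role": "user", "content": "Hello!"}],
)
print(response.choices[0].message.content)
```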
## Migrating to replacements
Once a model is deprecated, be sure to migrate all usage to a suitable replacement before the shutdown date. Requests to models past the shutdown date will fail.
To help measure the performance of replacement models on your tasks, we have open-sourced [Evals](https://github.com/openai/evals), a Python framework for evaluating language models.
If new models perform worse on your tasks, let us know by submitting a [pull request](https://github.com/openai/evals/pulls) to our Evals repo with examples of the task.
## Deprecation history
All deprecations are listed below, with the most recent announcements at the top.
### 2024-06-06: GPT-4-32K and Vision Preview models
On June 6th, 2024, we notified developers using `gpt-4-32k` and `gpt-4-vision-preview` of their upcoming deprecations in one year and six months respectively. As of June 17, 2024, only existing users of these models will be able to continue using them.
| Shutdown date | Deprecated model | Deprecated model price | Recommended replacement |
| ------------- | --------------------------- | -------------------------------------------------- | ----------------------- |
| 2025-06-06 | `gpt-4-32k` | $60.00 / 1M input tokens + $120 / 1M output tokens | `gpt-4o` |
| 2025-06-06 | `gpt-4-32k-0613` | $60.00 / 1M input tokens + $120 / 1M output tokens | `gpt-4o` |
| 2025-06-06 | `gpt-4-32k-0314` | $60.00 / 1M input tokens + $120 / 1M output tokens | `gpt-4o` |
| 2024-12-06 | `gpt-4-vision-preview` | $10.00 / 1M input tokens + $30 / 1M output tokens | `gpt-4o` |
| 2024-12-06 | `gpt-4-1106-vision-preview` | $10.00 / 1M input tokens + $30 / 1M output tokens | `gpt-4o` |
### 2023-11-06: Chat model updates
On November 6th, 2023, we [announced](https://openai.com/blog/new-models-and-developer-products-announced-at-devday) the release of an updated GPT-3.5-Turbo model (which now comes by default with 16k context) along with deprecation of `gpt-3.5-turbo-0613` and `gpt-3.5-turbo-16k-0613`. As of June 17, 2024, only existing users of these models will be able to continue using them.
| Shutdown date | Deprecated model | Deprecated model price | Recommended replacement |
| ------------- | ------------------------ | -------------------------------------------------- | ----------------------- |
| 2024-09-13 | `gpt-3.5-turbo-0613` | $1.50 / 1M input tokens + $2.00 / 1M output tokens | `gpt-3.5-turbo` |
| 2024-09-13 | `gpt-3.5-turbo-16k-0613` | $3.00 / 1M input tokens + $4.00 / 1M output tokens | `gpt-3.5-turbo` |
Fine-tuned models created from these base models are not affected by this deprecation, but you will no longer be able to create new fine-tuned versions with these models.
### 2023-08-22: Fine-tunes endpoint
On August 22nd, 2023, we [announced](https://openai.com/blog/gpt-3-5-turbo-fine-tuning-and-api-updates) the new fine-tuning API (`/v1/fine_tuning/jobs`) and that the original `/v1/fine-tunes` API along with legacy models (including those fine-tuned with the `/v1/fine-tunes` API) will be shut down on January 04, 2024. This means that models fine-tuned using the `/v1/fine-tunes` API will no longer be accessible, and you will need to fine-tune new models with the updated endpoint and associated base models.
#### Fine-tunes endpoint
| Shutdown date | System | Recommended replacement |
| ------------- | ---------------- | ----------------------- |
| 2024-01-04 | `/v1/fine-tunes` | `/v1/fine_tuning/jobs` |
### 2023-07-06: GPT and embeddings
On July 06, 2023, we [announced](https://openai.com/blog/gpt-4-api-general-availability) the upcoming retirements of older GPT-3 and GPT-3.5 models served via the completions endpoint. We also announced the upcoming retirement of our first-generation text embedding models. They will be shut down on January 04, 2024.
#### InstructGPT models
| Shutdown date | Deprecated model | Deprecated model price | Recommended replacement |
| ------------- | ------------------ | ---------------------- | ------------------------ |
| 2024-01-04 | `text-ada-001` | $0.40 / 1M tokens | `gpt-3.5-turbo-instruct` |
| 2024-01-04 | `text-babbage-001` | $0.50 / 1M tokens | `gpt-3.5-turbo-instruct` |
| 2024-01-04 | `text-curie-001` | $2.00 / 1M tokens | `gpt-3.5-turbo-instruct` |
| 2024-01-04 | `text-davinci-001` | $20.00 / 1M tokens | `gpt-3.5-turbo-instruct` |
| 2024-01-04 | `text-davinci-002` | $20.00 / 1M tokens | `gpt-3.5-turbo-instruct` |
| 2024-01-04 | `text-davinci-003` | $20.00 / 1M tokens | `gpt-3.5-turbo-instruct` |
Pricing for the replacement `gpt-3.5-turbo-instruct` model can be found on the [pricing page](https://openai.com/pricing).
#### Base GPT models
| Shutdown date | Deprecated model | Deprecated model price | Recommended replacement |
| ------------- | ------------------ | ---------------------- | ------------------------ |
| 2024-01-04 | `ada` | $0.40 / 1M tokens | `babbage-002` |
| 2024-01-04 | `babbage` | $0.50 / 1M tokens | `babbage-002` |
| 2024-01-04 | `curie` | $2.00 / 1M tokens | `davinci-002` |
| 2024-01-04 | `davinci` | $20.00 / 1M tokens | `davinci-002` |
| 2024-01-04 | `code-davinci-002` | --- | `gpt-3.5-turbo-instruct` |
Pricing for the replacement `babbage-002` and `davinci-002` models can be found on the [pricing page](https://openai.com/pricing).
#### Edit models & endpoint
| Shutdown date | Model / system | Recommended replacement |
| ------------- | ----------------------- | ----------------------- |
| 2024-01-04 | `text-davinci-edit-001` | `gpt-4o` |
| 2024-01-04 | `code-davinci-edit-001` | `gpt-4o` |
| 2024-01-04 | `/v1/edits` | `/v1/chat/completions` |
#### Fine-tuning GPT models
| Shutdown date | Deprecated model | Training price | Usage price | Recommended replacement |
| ------------- | ---------------- | ------------------ | ------------------- | ---------------------------------------- |
| 2024-01-04 | `ada` | $0.40 / 1M tokens | $1.60 / 1M tokens | `babbage-002` |
| 2024-01-04 | `babbage` | $0.60 / 1M tokens | $2.40 / 1M tokens | `babbage-002` |
| 2024-01-04 | `curie` | $3.00 / 1M tokens | $12.00 / 1M tokens | `davinci-002` |
| 2024-01-04    | `davinci`        | $30.00 / 1M tokens | $120.00 / 1M tokens | `davinci-002`, `gpt-3.5-turbo`, `gpt-4o` |
#### First-generation text embedding models
| Shutdown date | Deprecated model | Deprecated model price | Recommended replacement |
| ------------- | ------------------------------- | ---------------------- | ------------------------ |
| 2024-01-04 | `text-similarity-ada-001` | $4.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-search-ada-doc-001` | $4.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-search-ada-query-001` | $4.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `code-search-ada-code-001` | $4.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `code-search-ada-text-001` | $4.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-similarity-babbage-001` | $5.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-search-babbage-doc-001` | $5.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-search-babbage-query-001` | $5.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `code-search-babbage-code-001` | $5.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `code-search-babbage-text-001` | $5.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-similarity-curie-001` | $20.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-search-curie-doc-001` | $20.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-search-curie-query-001` | $20.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-similarity-davinci-001` | $200.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-search-davinci-doc-001` | $200.00 / 1M tokens | `text-embedding-3-small` |
| 2024-01-04 | `text-search-davinci-query-001` | $200.00 / 1M tokens | `text-embedding-3-small` |
### 2023-06-13: Updated chat models
On June 13, 2023, we announced new chat model versions in the [Function calling and other API updates](https://openai.com/blog/function-calling-and-other-api-updates) blog post. The three original versions will be retired in June 2024 at the earliest. As of January 10, 2024, only existing users of these models will be able to continue using them.
| Shutdown date | Legacy model | Legacy model price | Recommended replacement |
| ---------------------- | ------------ | ---------------------------------------------------- | ----------------------- |
| at earliest 2024-06-13 | `gpt-4-0314` | $30.00 / 1M input tokens + $60.00 / 1M output tokens | `gpt-4o` |
| Shutdown date | Deprecated model | Deprecated model price | Recommended replacement |
| ------------- | -------------------- | ----------------------------------------------------- | ----------------------- |
| 2024-09-13 | `gpt-3.5-turbo-0301` | $15.00 / 1M input tokens + $20.00 / 1M output tokens | `gpt-3.5-turbo` |
| 2025-06-06 | `gpt-4-32k-0314` | $60.00 / 1M input tokens + $120.00 / 1M output tokens | `gpt-4o` |
### 2023-03-20: Codex models
| Shutdown date | Deprecated model | Recommended replacement |
| ------------- | ------------------ | ----------------------- |
| 2023-03-23 | `code-davinci-002` | `gpt-4o` |
| 2023-03-23 | `code-davinci-001` | `gpt-4o` |
| 2023-03-23 | `code-cushman-002` | `gpt-4o` |
| 2023-03-23 | `code-cushman-001` | `gpt-4o` |
### 2022-06-03: Legacy endpoints
| Shutdown date | System | Recommended replacement |
| ------------- | --------------------- | ----------------------------------------------------------------------------------------------------- |
| 2022-12-03 | `/v1/engines` | [/v1/models](https://platform.openai.com/docs/api-reference/models/list) |
| 2022-12-03 | `/v1/search` | [View transition guide](https://help.openai.com/en/articles/6272952-search-transition-guide) |
| 2022-12-03 | `/v1/classifications` | [View transition guide](https://help.openai.com/en/articles/6272941-classifications-transition-guide) |
| 2022-12-03 | `/v1/answers` | [View transition guide](https://help.openai.com/en/articles/6233728-answers-transition-guide) |

@ -0,0 +1,510 @@
Embeddings
Learn how to turn text into numbers, unlocking use cases like search.
<Notice
className="mt-2 mb-2"
icon={false}
color={NoticeColor.primary}
body={
New embedding models
text-embedding-3-small and
text-embedding-3-large
, our newest and most performant embedding models are now available, with
lower costs, higher multilingual performance, and new parameters to control the
overall size.
}
textSize={NoticeTextSize.large}
dismissable={false}
/>
## What are embeddings?
OpenAI's text embeddings measure the relatedness of text strings. Embeddings are commonly used for:
- **Search** (where results are ranked by relevance to a query string)
- **Clustering** (where text strings are grouped by similarity)
- **Recommendations** (where items with related text strings are recommended)
- **Anomaly detection** (where outliers with little relatedness are identified)
- **Diversity measurement** (where similarity distributions are analyzed)
- **Classification** (where text strings are classified by their most similar label)
An embedding is a vector (list) of floating point numbers. The [distance](/docs/guides/embeddings/which-distance-function-should-i-use) between two vectors measures their relatedness. Small distances suggest high relatedness and large distances suggest low relatedness.
Visit our [pricing page](https://openai.com/api/pricing/) to learn about Embeddings pricing. Requests are billed based on the number of [tokens](/tokenizer) in the [input](/docs/api-reference/embeddings/create#embeddings/create-input).
## How to get embeddings
To get an embedding, send your text string to the [embeddings API endpoint](/docs/api-reference/embeddings) along with the embedding model name (e.g. `text-embedding-3-small`). The response will contain an embedding (list of floating point numbers), which you can extract, save in a vector database, and use for many different use cases:
<CodeSample
title="Example: Getting embeddings"
defaultLanguage="curl"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
response = client.embeddings.create(
input="Your text string goes here",
model="text-embedding-3-small"
)\n
print(response.data[0].embedding)
`.trim(),
curl: `
curl https://api.openai.com/v1/embeddings \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{
"input": "Your text string goes here",
"model": "text-embedding-3-small"
}'
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const embedding = await openai.embeddings.create({
model: "text-embedding-3-small",
input: "Your text string goes here",
encoding_format: "float",
});\n
console.log(embedding);
}\n
main();
`.trim(),
}}
/>
The response will contain the embedding vector along with some additional metadata.
<CodeSample
title="Example embedding response"
defaultLanguage="json"
code={{
json: `
{
"object": "list",
"data": [
{
"object": "embedding",
"index": 0,
"embedding": [
-0.006929283495992422,
-0.005336422007530928,
... (omitted for spacing)
-4.547132266452536e-05,
-0.024047505110502243
],
}
],
"model": "text-embedding-3-small",
"usage": {
"prompt_tokens": 5,
"total_tokens": 5
}
}
`.trim(),
}}
/>
By default, the length of the embedding vector will be 1536 for `text-embedding-3-small` or 3072 for `text-embedding-3-large`. You can reduce the dimensions of the embedding by passing in the [dimensions parameter](/docs/api-reference/embeddings/create#embeddings-create-dimensions) without the embedding losing its concept-representing properties. We go into more detail on embedding dimensions in the [embedding use case section](/docs/guides/embeddings/use-cases).
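For example, here is a minimal sketch of requesting a shortened embedding via the `dimensions` parameter:
```python
from openai import OpenAI

client = OpenAI()

response = client.embeddings.create(
    model="text-embedding-3-large",
    input="Your text string goes here",
    dimensions=256,  # shorten the default 3072-dimension embedding to 256 dimensions
)
print(len(response.data[0].embedding))  # 256
```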
## Embedding models
OpenAI offers two powerful third-generation embedding models (denoted by `-3` in the model ID). You can read the embedding v3 [announcement blog post](https://openai.com/blog/new-embedding-models-and-api-updates) for more details.
Usage is priced per input token. Below is an example of pricing, expressed as pages of text per US dollar (assuming ~800 tokens per page):
| Model | ~ Pages per dollar | Performance on [MTEB](https://github.com/embeddings-benchmark/mteb) eval | Max input |
| ---------------------- | ------------------ | ------------------------------------------------------------------------ | --------- |
| text-embedding-3-small | 62,500 | 62.3% | 8191 |
| text-embedding-3-large | 9,615 | 64.6% | 8191 |
| text-embedding-ada-002 | 12,500 | 61.0% | 8191 |
## Use cases
Here we show some representative use cases. We will use the [Amazon fine-food reviews dataset](https://www.kaggle.com/snap/amazon-fine-food-reviews) for the following examples.
### Obtaining the embeddings
The dataset contains a total of 568,454 food reviews Amazon users left up to October 2012. We will use a subset of the 1,000 most recent reviews for illustration purposes. The reviews are in English and tend to be positive or negative. Each review has a ProductId, UserId, Score, review title (Summary), and review body (Text). For example:
| Product Id | User Id | Score | Summary | Text |
| ---------- | -------------- | ----- | --------------------- | ------------------------------------------------- |
| B001E4KFG0 | A3SGXH7AUHU8GW | 5 | Good Quality Dog Food | I have bought several of the Vitality canned... |
| B00813GRG4 | A1D87F6ZCVE5NK | 1 | Not as Advertised | Product arrived labeled as Jumbo Salted Peanut... |
We will combine the review summary and review text into a single combined text. The model will encode this combined text and output a single vector embedding.
Get_embeddings_from_dataset.ipynb
```python
from openai import OpenAI

client = OpenAI()

def get_embedding(text, model="text-embedding-3-small"):
    # Replace newlines, which can negatively affect embedding quality
    text = text.replace("\n", " ")
    return client.embeddings.create(input=[text], model=model).data[0].embedding

# df is the reviews DataFrame with a "combined" column (Summary + Text)
df['ada_embedding'] = df.combined.apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
df.to_csv('output/embedded_1k_reviews.csv', index=False)
```
To load the data from a saved file, you can run the following:
```python
import numpy as np
import pandas as pd

df = pd.read_csv('output/embedded_1k_reviews.csv')
# Embeddings were saved as strings; parse them back into numpy arrays
df['ada_embedding'] = df.ada_embedding.apply(eval).apply(np.array)
```
Using larger embeddings, for example storing them in a vector store for retrieval, generally costs more and consumes more compute, memory and storage than using smaller embeddings.
Both of our new embedding models were trained [with a technique](https://arxiv.org/abs/2205.13147) that allows developers to trade-off performance and cost of using embeddings. Specifically, developers can shorten embeddings (i.e. remove some numbers from the end of the sequence) without the embedding losing its concept-representing properties by passing in the [`dimensions` API parameter](/docs/api-reference/embeddings/create#embeddings-create-dimensions). For example, on the MTEB benchmark, a `text-embedding-3-large` embedding can be shortened to a size of 256 while still outperforming an unshortened `text-embedding-ada-002` embedding with a size of 1536. You can read more about how changing the dimensions impacts performance in our [embeddings v3 launch blog post](https://openai.com/blog/new-embedding-models-and-api-updates#:~:text=Native%20support%20for%20shortening%20embeddings).
In general, using the `dimensions` parameter when creating the embedding is the suggested approach. In certain cases, you may need to change the embedding dimension after you generate it. When you change the dimension manually, make sure to re-normalize the embedding so that it retains unit length, as shown below.
```python
from openai import OpenAI
import numpy as np
client = OpenAI()
def normalize_l2(x):
x = np.array(x)
if x.ndim == 1:
norm = np.linalg.norm(x)
if norm == 0:
return x
return x / norm
else:
norm = np.linalg.norm(x, 2, axis=1, keepdims=True)
return np.where(norm == 0, x, x / norm)
response = client.embeddings.create(
model="text-embedding-3-small", input="Testing 123", encoding_format="float"
)
cut_dim = response.data[0].embedding[:256]
norm_dim = normalize_l2(cut_dim)
print(norm_dim)
```
Dynamically changing the dimensions enables very flexible usage. For example, when using a vector data store that only supports embeddings up to 1024 dimensions long, developers can now still use our best embedding model `text-embedding-3-large` and specify a value of 1024 for the `dimensions` API parameter, which will shorten the embedding down from 3072 dimensions, trading off some accuracy in exchange for the smaller vector size.
Question_answering_using_embeddings.ipynb
There are many common cases where the model is not trained on data which contains key facts and information you want to make accessible when generating responses to a user query. One way of solving this, as shown below, is to put additional information into the context window of the model. This is effective in many use cases but leads to higher token costs. In this notebook, we explore the trade-off between this approach and embeddings-based search.
```python
query = f"""Use the below article on the 2022 Winter Olympics to answer the subsequent question. If the answer cannot be found, write "I don't know."
Article:
\"\"\"
{wikipedia_article_on_curling}
\"\"\"
Question: Which athletes won the gold medal in curling at the 2022 Winter Olympics?"""
response = client.chat.completions.create(
messages=[
{'role': 'system', 'content': 'You answer questions about the 2022 Winter Olympics.'},
{'role': 'user', 'content': query},
],
model=GPT_MODEL,
temperature=0,
)
print(response.choices[0].message.content)
```
Semantic_text_search_using_embeddings.ipynb
To retrieve the most relevant documents we use the cosine similarity between the embedding vectors of the query and each document, and return the highest scored documents.
```python
from openai.embeddings_utils import get_embedding, cosine_similarity
def search_reviews(df, product_description, n=3, pprint=True):
embedding = get_embedding(product_description, model='text-embedding-3-small')
df['similarities'] = df.ada_embedding.apply(lambda x: cosine_similarity(x, embedding))
res = df.sort_values('similarities', ascending=False).head(n)
return res
res = search_reviews(df, 'delicious beans', n=3)
```
Code_search.ipynb
Code search works similarly to embedding-based text search. We provide a method to extract Python functions from all the Python files in a given repository. Each function is then indexed by the `text-embedding-3-small` model.
To perform a code search, we embed the query in natural language using the same model. Then we calculate cosine similarity between the resulting query embedding and each of the function embeddings. The highest cosine similarity results are most relevant.
```python
from openai.embeddings_utils import get_embedding, cosine_similarity
df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
def search_functions(df, code_query, n=3, pprint=True, n_lines=7):
embedding = get_embedding(code_query, model='text-embedding-3-small')
df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))
res = df.sort_values('similarities', ascending=False).head(n)
return res
res = search_functions(df, 'Completions API tests', n=3)
```
Recommendation_using_embeddings.ipynb
Because shorter distances between embedding vectors represent greater similarity, embeddings can be useful for recommendation.
Below, we illustrate a basic recommender. It takes in a list of strings and one 'source' string, computes their embeddings, and then returns a ranking of the strings, ranked from most similar to least similar. As a concrete example, the linked notebook below applies a version of this function to the [AG news dataset](http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html) (sampled down to 2,000 news article descriptions) to return the top 5 most similar articles to any given source article.
```python
from typing import List

def recommendations_from_strings(
strings: List[str],
index_of_source_string: int,
model="text-embedding-3-small",
) -> List[int]:
"""Return nearest neighbors of a given string."""
# get embeddings for all strings
embeddings = [embedding_from_string(string, model=model) for string in strings]
# get the embedding of the source string
query_embedding = embeddings[index_of_source_string]
# get distances between the source embedding and other embeddings (function from embeddings_utils.py)
distances = distances_from_embeddings(query_embedding, embeddings, distance_metric="cosine")
# get indices of nearest neighbors (function from embeddings_utils.py)
indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)
return indices_of_nearest_neighbors
```
Visualizing_embeddings_in_2D.ipynb
The size of the embeddings varies with the complexity of the underlying model. In order to visualize this high dimensional data we use the t-SNE algorithm to transform the data into two dimensions.
We color the individual reviews based on the star rating which the reviewer has given:
- 1-star: red
- 2-star: dark orange
- 3-star: gold
- 4-star: turquoise
- 5-star: dark green
The visualization seems to have produced roughly 3 clusters, one of which has mostly negative reviews.
```python
import pandas as pd
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib
df = pd.read_csv('output/embedded_1k_reviews.csv')
matrix = df.ada_embedding.apply(eval).to_list()
# Create a t-SNE model and transform the data
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(matrix)
colors = ["red", "darkorange", "gold", "turquiose", "darkgreen"]
x = [x for x,y in vis_dims]
y = [y for x,y in vis_dims]
color_indices = df.Score.values - 1
colormap = matplotlib.colors.ListedColormap(colors)
plt.scatter(x, y, c=color_indices, cmap=colormap, alpha=0.3)
plt.title("Amazon ratings visualized in language using t-SNE")
```
Regression_using_embeddings.ipynb
An embedding can be used as a general free-text feature encoder within a machine learning model. Incorporating embeddings can improve the performance of a machine learning model when some of the relevant inputs are free text. An embedding can also be used as a categorical feature encoder within an ML model. This adds the most value if the names of categorical variables are meaningful and numerous, such as job titles. Similarity embeddings generally perform better than search embeddings for this task.
We observed that generally the embedding representation is very rich and information dense. For example, reducing the dimensionality of the inputs using SVD or PCA, even by 10%, generally results in worse downstream performance on specific tasks.
This code splits the data into a training set and a testing set, which will be used by the following two use cases, namely regression and classification.
```python
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
list(df.ada_embedding.values),
df.Score,
test_size = 0.2,
random_state=42
)
```
#### Regression using the embedding features
Embeddings present an elegant way of predicting a numerical value. In this example we predict the reviewer's star rating based on the text of their review. Because embeddings capture rich semantic information, the prediction is decent even with very few reviews.
We assume the score is a continuous variable between 1 and 5, and allow the algorithm to predict any floating point value. The ML algorithm minimizes the distance of the predicted value to the true score, and achieves a mean absolute error of 0.39, which means that on average the prediction is off by less than half a star.
```python
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor(n_estimators=100)
rfr.fit(X_train, y_train)
preds = rfr.predict(X_test)
```
Classification_using_embeddings.ipynb
This time, instead of having the algorithm predict a value anywhere between 1 and 5, we will attempt to classify each review into one of 5 buckets, ranging from 1 to 5 stars.
After the training, the model learns to predict 1 and 5-star reviews much better than the more nuanced reviews (2-4 stars), likely due to more extreme sentiment expression.
```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
```
Zero-shot_classification_with_embeddings.ipynb
We can use embeddings for zero shot classification without any labeled training data. For each class, we embed the class name or a short description of the class. To classify some new text in a zero-shot manner, we compare its embedding to all class embeddings and predict the class with the highest similarity.
```python
from openai.embeddings_utils import cosine_similarity, get_embedding

df = df[df.Score != 3]
df['sentiment'] = df.Score.replace({1: 'negative', 2: 'negative', 4: 'positive', 5: 'positive'})

labels = ['negative', 'positive']
# `model` is the embedding model name used throughout, e.g. 'text-embedding-3-small'
label_embeddings = [get_embedding(label, model=model) for label in labels]

def label_score(review_embedding, label_embeddings):
    return cosine_similarity(review_embedding, label_embeddings[1]) - cosine_similarity(review_embedding, label_embeddings[0])

# Embed the review text before scoring it against the label embeddings
review_embedding = get_embedding('Sample Review', model=model)
prediction = 'positive' if label_score(review_embedding, label_embeddings) > 0 else 'negative'
```
User_and_product_embeddings.ipynb
We can obtain a user embedding by averaging over all of their reviews. Similarly, we can obtain a product embedding by averaging over all the reviews about that product. In order to showcase the usefulness of this approach we use a subset of 50k reviews to cover more reviews per user and per product.
We evaluate the usefulness of these embeddings on a separate test set, where we plot similarity of the user and product embedding as a function of the rating. Interestingly, based on this approach, even before the user receives the product we can predict better than random whether they would like the product.
```python
import numpy as np

user_embeddings = df.groupby('UserId').ada_embedding.apply(np.mean)
prod_embeddings = df.groupby('ProductId').ada_embedding.apply(np.mean)
```
Clustering.ipynb
Clustering is one way of making sense of a large volume of textual data. Embeddings are useful for this task, as they provide semantically meaningful vector representations of each text. Thus, in an unsupervised way, clustering will uncover hidden groupings in our dataset.
In this example, we discover four distinct clusters: one focusing on dog food, one on negative reviews, and two on positive reviews.
```python
import numpy as np
from sklearn.cluster import KMeans
matrix = np.vstack(df.ada_embedding.values)
n_clusters = 4
kmeans = KMeans(n_clusters = n_clusters, init='k-means++', random_state=42)
kmeans.fit(matrix)
df['Cluster'] = kmeans.labels_
```
## Frequently asked questions
### How can I tell how many tokens a string has before I embed it?
In Python, you can split a string into tokens with OpenAI's tokenizer [`tiktoken`](https://github.com/openai/tiktoken).
Example code:
```python
import tiktoken
def num_tokens_from_string(string: str, encoding_name: str) -> int:
"""Returns the number of tokens in a text string."""
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
return num_tokens
num_tokens_from_string("tiktoken is great!", "cl100k_base")
```
For third-generation embedding models like `text-embedding-3-small`, use the `cl100k_base` encoding.
More details and example code are in the OpenAI Cookbook guide [how to count tokens with tiktoken](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken).
### How can I retrieve K nearest embedding vectors quickly?
For searching over many vectors quickly, we recommend using a vector database. You can find examples of working with vector databases and the OpenAI API [in our Cookbook](https://cookbook.openai.com/examples/vector_databases/readme) on GitHub.
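For smaller datasets, an exact in-memory search is often sufficient before reaching for a dedicated vector database; here is a minimal numpy sketch (it assumes the query and document embeddings were produced by the embeddings endpoint as shown above):
```python
import numpy as np

def k_nearest(query_embedding, embeddings, k=5):
    """Return the indices of the k most similar embeddings (exact search)."""
    matrix = np.array(embeddings)      # shape: (n_documents, n_dimensions)
    query = np.array(query_embedding)  # shape: (n_dimensions,)
    # OpenAI embeddings are normalized to length 1, so a dot product equals cosine similarity
    similarities = matrix @ query
    return np.argsort(-similarities)[:k]
```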
### Which distance function should I use?
We recommend [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). The choice of distance function typically doesn't matter much.
OpenAI embeddings are normalized to length 1, which means that:
- Cosine similarity can be computed slightly faster using just a dot product
- Cosine similarity and Euclidean distance will result in identical rankings
### Can I share my embeddings online?
Yes, customers own their input and output from our models, including in the case of embeddings. You are responsible for ensuring that the content you input to our API does not violate any applicable law or our [Terms of Use](https://openai.com/policies/terms-of-use).
### Do V3 embedding models know about recent events?
No, the `text-embedding-3-large` and `text-embedding-3-small` models lack knowledge of events that occurred after September 2021. This is generally not as much of a limitation as it would be for text generation models, but in certain edge cases it can reduce performance.

@ -0,0 +1,244 @@
Error codes
This guide includes an overview of the error codes you might see from both the [API](/docs/introduction) and our [official Python library](/docs/libraries/python-library). Each error code mentioned in the overview has a dedicated section with further guidance.
## API errors
| Code | Overview |
| --------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| 401 - Invalid Authentication | **Cause:** Invalid Authentication **Solution:** Ensure the correct [API key](/account/api-keys) and requesting organization are being used. |
| 401 - Incorrect API key provided | **Cause:** The requesting API key is not correct. **Solution:** Ensure the API key used is correct, clear your browser cache, or [generate a new one](/account/api-keys). |
| 401 - You must be a member of an organization to use the API | **Cause:** Your account is not part of an organization. **Solution:** Contact us to get added to a new organization or ask your organization manager to [invite you to an organization](/account/team). |
| 403 - Country, region, or territory not supported | **Cause:** You are accessing the API from an unsupported country, region, or territory. **Solution:** Please see [this page](/docs/supported-countries) for more information. |
| 429 - Rate limit reached for requests | **Cause:** You are sending requests too quickly. **Solution:** Pace your requests. Read the [Rate limit guide](/docs/guides/rate-limits). |
| 429 - You exceeded your current quota, please check your plan and billing details | **Cause:** You have run out of credits or hit your maximum monthly spend. **Solution:** [Buy more credits](/account/billing) or learn how to [increase your limits](/account/limits). |
| 500 - The server had an error while processing your request | **Cause:** Issue on our servers. **Solution:** Retry your request after a brief wait and contact us if the issue persists. Check the [status page](https://status.openai.com/). |
| 503 - The engine is currently overloaded, please try again later | **Cause:** Our servers are experiencing high traffic. **Solution:** Please retry your requests after a brief wait. |
This error message indicates that your authentication credentials are invalid. This could happen for several reasons, such as:
- You are using a revoked API key.
- You are using a different API key than the one assigned to the requesting organization or project.
- You are using an API key that does not have the required permissions for the endpoint you are calling.
To resolve this error, please follow these steps:
- Check that you are using the correct API key and organization ID in your request header. You can find your API key and organization ID in [your account settings](/account/api-keys), or you can find project-specific keys under [General settings](/settings/organization/general) by selecting the desired project.
- If you are unsure whether your API key is valid, you can [generate a new one](/account/api-keys). Make sure to replace your old API key with the new one in your requests and follow our [best practices guide](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).
This error message indicates that the API key you are using in your request is not correct. This could happen for several reasons, such as:
- There is a typo or an extra space in your API key.
- You are using an API key that belongs to a different organization or project.
- You are using an API key that has been deleted or deactivated.
- An old, revoked API key might be cached locally.
To resolve this error, please follow these steps:
- Try clearing your browser's cache and cookies, then try again.
- Check that you are using the correct API key in your request header.
- If you are unsure whether your API key is correct, you can [generate a new one](/account/api-keys). Make sure to replace your old API key in your codebase and follow our [best practices guide](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).
This error message indicates that your account is not part of an organization. This could happen for several reasons, such as:
- You have left or been removed from your previous organization.
- You have left or been removed from your previous project.
- Your organization has been deleted.
To resolve this error, please follow these steps:
- If you have left or been removed from your previous organization, you can either request a new organization or get invited to an existing one.
- To request a new organization, reach out to us via help.openai.com
  - Existing organization owners can invite you to join their organization via the [Team page](/account/team) or can create a new project from the [Settings page](/settings/organization/general)
- If you have left or been removed from a previous project, you can ask your organization or project owner to add you to it, or create a new one.
This error message indicates that you have hit your assigned rate limit for the API. This means that you have submitted too many tokens or requests in a short period of time and have exceeded the number of requests allowed. This could happen for several reasons, such as:
- You are using a loop or a script that makes frequent or concurrent requests.
- You are sharing your API key with other users or applications.
- You are using a free plan that has a low rate limit.
- You have reached the defined limit on your project.
To resolve this error, please follow these steps:
- Pace your requests and avoid making unnecessary or redundant calls.
- If you are using a loop or a script, make sure to implement a backoff mechanism or a retry logic that respects the rate limit and the response headers. You can read more about our rate limiting policy and best practices in our [rate limit guide](/docs/guides/rate-limits).
- If you are sharing your organization with other users, note that limits are applied per organization and not per user. It is worth checking on the usage of the rest of your team as this will contribute to the limit.
- If you are using a free or low-tier plan, consider upgrading to a pay-as-you-go plan that offers a higher rate limit. You can compare the restrictions of each plan in our [rate limit guide](/docs/guides/rate-limits).
- Reach out to your organization owner to increase the rate limits on your project.
This error message indicates that you hit your monthly [usage limit](/account/limits) for the API, or for prepaid credits customers that you've consumed all your credits. You can view your maximum usage limit on the [limits page](/account/limits). This could happen for several reasons, such as:
- You are using a high-volume or complex service that consumes a lot of credits or tokens.
- Your monthly budget is set too low for your organization's usage.
- Your monthly budget is set too low for your project's usage.
To resolve this error, please follow these steps:
- Check your [current usage](/account/usage) of your account, and compare that to your account's [limits](/account/limits).
- If you are on a free plan, consider [upgrading to a paid plan](/account/billing) to get higher limits.
- Reach out to your organization owner to increase the budgets for your project.
This error message indicates that our servers are experiencing high traffic and are unable to process your request at the moment. This could happen for several reasons, such as:
- There is a sudden spike or surge in demand for our services.
- There is scheduled or unscheduled maintenance or update on our servers.
- There is an unexpected or unavoidable outage or incident on our servers.
To resolve this error, please follow these steps:
- Retry your request after a brief wait. We recommend using an exponential backoff strategy or a retry logic that respects the response headers and the rate limit. You can read more about our rate limit [best practices](https://help.openai.com/en/articles/6891753-rate-limit-advice).
- Check our [status page](https://status.openai.com/) for any updates or announcements regarding our services and servers.
- If you are still getting this error after a reasonable amount of time, please contact us for further assistance. We apologize for any inconvenience and appreciate your patience and understanding.
## Python library error types
| Type | Overview |
| ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| APIConnectionError | **Cause:** Issue connecting to our services. **Solution:** Check your network settings, proxy configuration, SSL certificates, or firewall rules. |
| APITimeoutError | **Cause:** Request timed out. **Solution:** Retry your request after a brief wait and contact us if the issue persists. |
| AuthenticationError | **Cause:** Your API key or token was invalid, expired, or revoked. **Solution:** Check your API key or token and make sure it is correct and active. You may need to generate a new one from your account dashboard. |
| BadRequestError | **Cause:** Your request was malformed or missing some required parameters, such as a token or an input. **Solution:** The error message should advise you on the specific error made. Check the [documentation](/docs/api-reference/) for the specific API method you are calling and make sure you are sending valid and complete parameters. You may also need to check the encoding, format, or size of your request data. |
| ConflictError | **Cause:** The resource was updated by another request. **Solution:** Try to update the resource again and ensure no other requests are trying to update it. |
| InternalServerError | **Cause:** Issue on our side. **Solution:** Retry your request after a brief wait and contact us if the issue persists. |
| NotFoundError            | **Cause:** Requested resource does not exist. **Solution:** Ensure you are using the correct resource identifier. |
| PermissionDeniedError | **Cause:** You don't have access to the requested resource. **Solution:** Ensure you are using the correct API key, organization ID, and resource ID. |
| RateLimitError | **Cause:** You have hit your assigned rate limit. **Solution:** Pace your requests. Read more in our [Rate limit guide](/docs/guides/rate-limits). |
| UnprocessableEntityError | **Cause:** Unable to process the request despite the format being correct. **Solution:** Please try the request again. |
An `APIConnectionError` indicates that your request could not reach our servers or establish a secure connection. This could be due to a network issue, a proxy configuration, an SSL certificate, or a firewall rule.
If you encounter an `APIConnectionError`, please try the following steps:
- Check your network settings and make sure you have a stable and fast internet connection. You may need to switch to a different network, use a wired connection, or reduce the number of devices or applications using your bandwidth.
- Check your proxy configuration and make sure it is compatible with our services. You may need to update your proxy settings, use a different proxy, or bypass the proxy altogether.
- Check your SSL certificates and make sure they are valid and up-to-date. You may need to install or renew your certificates, use a different certificate authority, or disable SSL verification.
- Check your firewall rules and make sure they are not blocking or filtering our services. You may need to modify your firewall settings.
- If appropriate, check that your container has the correct permissions to send and receive traffic.
- If the issue persists, check out our persistent errors next steps section.
An `APITimeoutError` indicates that your request took too long to complete and our server closed the connection. This could be due to a network issue, a heavy load on our services, or a complex request that requires more processing time.
If you encounter an `APITimeoutError`, please try the following steps:
- Wait a few seconds and retry your request. Sometimes, the network congestion or the load on our services may be reduced and your request may succeed on the second attempt.
- Check your network settings and make sure you have a stable and fast internet connection. You may need to switch to a different network, use a wired connection, or reduce the number of devices or applications using your bandwidth.
- If the issue persists, check out our persistent errors next steps section.
An `AuthenticationError` indicates that your API key or token was invalid, expired, or revoked. This could be due to a typo, a formatting error, or a security breach.
If you encounter an `AuthenticationError`, please try the following steps:
- Check your API key or token and make sure it is correct and active. You may need to generate a new key from the API Key dashboard, ensure there are no extra spaces or characters, or use a different key or token if you have multiple ones.
- Ensure that you have followed the correct formatting.
A `BadRequestError` (formerly `InvalidRequestError`) indicates that your request was malformed or missing some required parameters, such as a token or an input. This could be due to a typo, a formatting error, or a logic error in your code.
If you encounter a `BadRequestError`, please try the following steps:
- Read the error message carefully and identify the specific error made. The error message should advise you on what parameter was invalid or missing, and what value or format was expected.
- Check the [API Reference](/docs/api-reference/) for the specific API method you were calling and make sure you are sending valid and complete parameters. You may need to review the parameter names, types, values, and formats, and ensure they match the documentation.
- Check the encoding, format, or size of your request data and make sure they are compatible with our services. You may need to encode your data in UTF-8, format your data in JSON, or compress your data if it is too large.
- Test your request using a tool like Postman or curl and make sure it works as expected. You may need to debug your code and fix any errors or inconsistencies in your request logic.
- If the issue persists, check out our persistent errors next steps section.
An `InternalServerError` indicates that something went wrong on our side when processing your request. This could be due to a temporary error, a bug, or a system outage.
We apologize for any inconvenience and we are working hard to resolve any issues as soon as possible. You can [check our system status page](https://status.openai.com/) for more information.
If you encounter an `InternalServerError`, please try the following steps:
- Wait a few seconds and retry your request. Sometimes, the issue may be resolved quickly and your request may succeed on the second attempt.
- Check our status page for any ongoing incidents or maintenance that may affect our services. If there is an active incident, please follow the updates and wait until it is resolved before retrying your request.
- If the issue persists, check out our persistent errors next steps section.
A `RateLimitError` indicates that you have hit your assigned rate limit. This means that you have sent too many tokens or requests in a given period of time, and our services have temporarily blocked you from sending more.
We impose rate limits to ensure fair and efficient use of our resources and to prevent abuse or overload of our services.
If you encounter a `RateLimitError`, please try the following steps:
- Send fewer tokens or requests or slow down. You may need to reduce the frequency or volume of your requests, batch your tokens, or implement exponential backoff (see the sketch after this list). You can read our [Rate limit guide](/docs/guides/rate-limits) for more details.
- Wait until your rate limit resets (one minute) and retry your request. The error message should give you a sense of your usage rate and permitted usage.
- You can also check your API usage statistics from your account dashboard.
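Here is a minimal sketch of retrying with exponential backoff, assuming the `tenacity` library is installed:
```python
from openai import OpenAI, RateLimitError
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential

client = OpenAI()

@retry(
    retry=retry_if_exception_type(RateLimitError),
    wait=wait_random_exponential(min=1, max=60),  # back off for 1-60 seconds between attempts
    stop=stop_after_attempt(6),
)
def create_chat_completion(**kwargs):
    return client.chat.completions.create(**kwargs)

response = create_chat_completion(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello!"}],
)
```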
### Persistent errors
If the issue persists, [contact our support team via chat](https://help.openai.com/en/) and provide them with the following information:
- The model you were using
- The error message and code you received
- The request data and headers you sent
- The timestamp and timezone of your request
- Any other relevant details that may help us diagnose the issue
Our support team will investigate the issue and get back to you as soon as possible. Note that our support queue times may be long due to high demand. You can also [post in our Community Forum](https://community.openai.com) but be sure to omit any sensitive information.
### Handling errors
We advise you to programmatically handle errors returned by the API. To do so, you may want to use a code snippet like the one below:
```python
import openai
from openai import OpenAI

client = OpenAI()

try:
    # Make your OpenAI API request here
    response = client.completions.create(
        prompt="Hello world",
        model="gpt-3.5-turbo-instruct"
    )
except openai.APIConnectionError as e:
    # Handle connection error here
    print(f"Failed to connect to OpenAI API: {e}")
except openai.RateLimitError as e:
    # Handle rate limit error (we recommend using exponential backoff)
    print(f"OpenAI API request exceeded rate limit: {e}")
except openai.APIError as e:
    # Handle generic API errors here, e.g. retry or log
    # (catch this last so the more specific errors above are not masked)
    print(f"OpenAI API returned an API Error: {e}")
```

@ -0,0 +1,827 @@
# Fine-tuning
Learn how to customize a model for your application.
## Introduction
Fine-tuning lets you get more out of the models available through the API by providing:
- Higher quality results than prompting
- Ability to train on more examples than can fit in a prompt
- Token savings due to shorter prompts
- Lower latency requests
OpenAI's text generation models have been pre-trained on a vast amount of text. To use the models effectively, we include instructions and sometimes several examples in a prompt. Using demonstrations to show how to perform a task is often called "few-shot learning."
Fine-tuning improves on few-shot learning by training on many more examples than can fit in the prompt, letting you achieve better results on a wide number of tasks. **Once a model has been fine-tuned, you won't need to provide as many examples in the prompt.** This saves costs and enables lower-latency requests.
At a high level, fine-tuning involves the following steps:
1. Prepare and upload training data
2. Train a new fine-tuned model
3. Evaluate results and go back to step 1 if needed
4. Use your fine-tuned model
Visit our [pricing page](https://openai.com/api/pricing) to learn more about how fine-tuned model training and usage are billed.
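As a minimal sketch of this workflow using the Python SDK (the training file name is an illustrative assumption):
```python
from openai import OpenAI

client = OpenAI()

# 1. Upload the prepared JSONL training data
training_file = client.files.create(
    file=open("marv_training_data.jsonl", "rb"),  # assumption: your prepared dataset
    purpose="fine-tune",
)

# 2. Train a new fine-tuned model
job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-3.5-turbo",
)

# 3-4. Once the job succeeds, evaluate the results and call the resulting
# fine-tuned model (reported on the completed job object) like any other model
print(job.id)
```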
### Which models can be fine-tuned?
Fine-tuning for GPT-4 (`gpt-4-0613` and `gpt-4o-*`) is in an experimental access program - eligible users can request access in the fine-tuning UI when creating a new fine-tuning job.
Fine-tuning is currently available for the following models: `gpt-3.5-turbo-0125` (recommended), `gpt-3.5-turbo-1106`, `gpt-3.5-turbo-0613`, `babbage-002`, `davinci-002`, `gpt-4-0613` (experimental), and `gpt-4o-2024-05-13`.
You can also fine-tune a fine-tuned model, which is useful if you acquire additional data and don't want to repeat the previous training steps.
We expect `gpt-3.5-turbo` to be the right model for most users in terms of results and ease of use.
## When to use fine-tuning
Fine-tuning OpenAI text generation models can make them better for specific applications, but it requires a careful investment of time and effort. We recommend first attempting to get good results with prompt engineering, prompt chaining (breaking complex tasks into multiple prompts), and [function calling](/docs/guides/function-calling), with the key reasons being:
- There are many tasks at which our models may not initially appear to perform well, but results can be improved with the right prompts - thus fine-tuning may not be necessary
- Iterating over prompts and other tactics has a much faster feedback loop than iterating with fine-tuning, which requires creating datasets and running training jobs
- In cases where fine-tuning is still necessary, initial prompt engineering work is not wasted - we typically see best results when using a good prompt in the fine-tuning data (or combining prompt chaining / tool use with fine-tuning)
Our [prompt engineering guide](/docs/guides/prompt-engineering) provides a background on some of the most effective strategies and tactics for getting better performance without fine-tuning. You may find it helpful to iterate quickly on prompts in our [playground](/playground).
### Common use cases
Some common use cases where fine-tuning can improve results:
- Setting the style, tone, format, or other qualitative aspects
- Improving reliability at producing a desired output
- Correcting failures to follow complex prompts
- Handling many edge cases in specific ways
- Performing a new skill or task that's hard to articulate in a prompt
One high-level way to think about these cases is when it's easier to "show, not tell". In the sections to come, we will explore how to set up data for fine-tuning and various examples where fine-tuning improves the performance over the baseline model.
Another scenario where fine-tuning is effective is in reducing costs and / or latency, by replacing GPT-4 or by utilizing shorter prompts, without sacrificing quality. If you can achieve good results with GPT-4, you can often reach similar quality with a fine-tuned `gpt-3.5-turbo` model by fine-tuning on the GPT-4 completions, possibly with a shortened instruction prompt.
## Preparing your dataset
Once you have determined that fine-tuning is the right solution (i.e. you've optimized your prompt as far as it can take you and identified problems that the model still has), you'll need to prepare data for training the model. You should create a diverse set of demonstration conversations that are similar to the conversations you will ask the model to respond to at inference time in production.
Each example in the dataset should be a conversation in the same format as our [Chat Completions API](/docs/api-reference/chat/create), specifically a list of messages where each message has a role, content, and [optional name](/docs/api-reference/chat/create#chat/create-chat/create-messages-name). At least some of the training examples should directly target cases where the prompted model is not behaving as desired, and the provided assistant messages in the data should be the ideal responses you want the model to provide.
### Example format
In this example, our goal is to create a chatbot that occasionally gives sarcastic responses. Here are three training examples (conversations) we could create for such a dataset:
```jsonl
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
```
The conversational chat format is required to fine-tune `gpt-3.5-turbo`. For `babbage-002` and `davinci-002`, you can follow the prompt completion pair format as shown below.
```json
{"prompt": "", "completion": ""}
{"prompt": "", "completion": ""}
{"prompt": "", "completion": ""}
```
### Multi-turn chat examples
Examples in the chat format can have multiple messages with the assistant role. The default behavior during fine-tuning is to train on all assistant messages within a single example. To skip fine-tuning on specific assistant messages, a `weight` key can be added to disable fine-tuning on that message, allowing you to control which assistant messages are learned. The allowed values for `weight` are currently 0 or 1. Some examples using `weight` for the chat format are below.
```jsonl
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris", "weight": 0}, {"role": "user", "content": "Can you be more sarcastic?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already.", "weight": 1}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "William Shakespeare", "weight": 0}, {"role": "user", "content": "Can you be more sarcastic?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?", "weight": 1}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "384,400 kilometers", "weight": 0}, {"role": "user", "content": "Can you be more sarcastic?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters.", "weight": 1}]}
```
### Crafting prompts
We generally recommend taking the set of instructions and prompts that you found worked best for the model prior to fine-tuning, and including them in every training example. This should let you reach the best and most general results, especially if you have relatively few (e.g. under a hundred) training examples.
If you would like to shorten the instructions or prompts that are repeated in every example to save costs, keep in mind that the model will likely behave as if those instructions were included, and it may be hard to get the model to ignore those "baked-in" instructions at inference time.
It may take more training examples to arrive at good results, as the model has to learn entirely through demonstration and without guided instructions.
### Example count recommendations
To fine-tune a model, you are required to provide at least 10 examples. We typically see clear improvements from fine-tuning on 50 to 100 training examples with `gpt-3.5-turbo` but the right number varies greatly based on the exact use case.
We recommend starting with 50 well-crafted demonstrations and seeing if the model shows signs of improvement after fine-tuning. In some cases that may be sufficient, but even if the model is not yet production quality, clear improvements are a good sign that providing more data will continue to improve the model. No improvement suggests that you may need to rethink how to set up the task for the model or restructure the data before scaling beyond a limited example set.
### Train and test splits
After collecting the initial dataset, we recommend splitting it into a training and test portion. When submitting a fine-tuning job with both training and test files, we will provide statistics on both during the course of training. These statistics will be your initial signal of how much the model is improving. Additionally, constructing a test set early on will be useful in making sure you are able to evaluate the model after training, by generating samples on the test set.
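As a minimal sketch of such a split, assuming your examples live in a local JSONL file (the `marv.jsonl` filename and the 80/20 ratio are arbitrary placeholder choices):
```python
import json
import random

# Load the full dataset (one JSON object per line).
with open("marv.jsonl") as f:
    examples = [json.loads(line) for line in f if line.strip()]

# Shuffle, then hold out ~20% of the examples as a test set.
random.seed(42)
random.shuffle(examples)
split = int(len(examples) * 0.8)

for path, subset in [("train.jsonl", examples[:split]), ("test.jsonl", examples[split:])]:
    with open(path, "w") as out:
        for example in subset:
            out.write(json.dumps(example) + "\n")
```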
### Token limits
Token limits depend on the model you select. For `gpt-3.5-turbo-0125`, the maximum context length is 16,385 so each training example is also limited to 16,385 tokens. For `gpt-3.5-turbo-0613`, each training example is limited to 4,096 tokens. Examples longer than the default will be truncated to the maximum context length which removes tokens from the end of the training example(s). To be sure that your entire training example fits in context, consider checking that the total token counts in the message contents are under the limit.
You can compute token counts using our [counting tokens notebook](https://cookbook.openai.com/examples/How_to_count_tokens_with_tiktoken.ipynb) from the OpenAI cookbook.
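As a rough sketch of that approach, assuming a chat-format JSONL dataset (the filename is a placeholder, and the count ignores the small per-message formatting overhead):
```python
import json
import tiktoken

# gpt-3.5-turbo and gpt-4 family models use the cl100k_base encoding.
encoding = tiktoken.get_encoding("cl100k_base")

def approx_tokens_per_example(example):
    # Counts tokens in message contents only; the true total is slightly
    # higher because each message carries a small formatting overhead.
    return sum(
        len(encoding.encode(message["content"]))
        for message in example["messages"]
        if message.get("content")
    )

with open("marv.jsonl") as f:  # placeholder filename
    counts = [approx_tokens_per_example(json.loads(line)) for line in f if line.strip()]

print(f"examples: {len(counts)}, max tokens: {max(counts)}, total tokens: {sum(counts)}")
```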
### Estimate costs
For detailed pricing on training costs, as well as input and output costs for a deployed fine-tuned model, visit our [pricing page](https://openai.com/pricing). Note that we don't charge for tokens used for training validation. To estimate the cost of a specific fine-tuning training job, use the following formula:
> (base training cost per 1M input tokens ÷ 1M) × number of tokens in the input file × number of epochs trained
For a training file with 100,000 tokens trained over 3 epochs, the expected cost would be ~$2.40 USD with `gpt-3.5-turbo-0125`.
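As a sanity check on the formula, here is the worked example above in a few lines of Python (the per-token rate is inferred from that example rather than taken from the pricing page, so treat it as an assumption):
```python
# The $8.00 per 1M training tokens rate below is inferred from the worked
# example above, not an authoritative price; check the pricing page.
base_cost_per_1m_tokens = 8.00
tokens_in_training_file = 100_000
n_epochs = 3

estimated_cost = (base_cost_per_1m_tokens / 1_000_000) * tokens_in_training_file * n_epochs
print(f"Estimated training cost: ${estimated_cost:.2f}")  # -> Estimated training cost: $2.40
```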
### Check data formatting
Once you have compiled a dataset and before you create a fine-tuning job, it is important to check the data formatting. To do this, we created a simple Python script which you can use to find potential errors, review token counts, and estimate the cost of a fine-tuning job.
Fine-tuning data format validation: learn about fine-tuning data formatting in the OpenAI Cookbook's [data preparation and analysis notebook](https://cookbook.openai.com/examples/chat_finetuning_data_prep).
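As a rough illustration of the kinds of checks that script performs, here is a reduced sketch for the chat format (the filename is a placeholder; the real notebook performs more thorough checks and token counting):
```python
import json
from collections import defaultdict

errors = defaultdict(int)

with open("marv.jsonl") as f:  # placeholder filename
    dataset = [json.loads(line) for line in f if line.strip()]

for example in dataset:
    messages = example.get("messages")
    if not messages:
        errors["missing_messages_list"] += 1
        continue
    for message in messages:
        # Each message should carry a recognized role and some content.
        if "role" not in message or "content" not in message:
            errors["message_missing_key"] += 1
        if message.get("role") not in ("system", "user", "assistant"):
            errors["unrecognized_role"] += 1
    # Every example needs at least one assistant message to learn from.
    if not any(message.get("role") == "assistant" for message in messages):
        errors["example_missing_assistant_message"] += 1

if errors:
    for name, count in errors.items():
        print(f"{name}: {count}")
else:
    print("No errors found.")
```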
### Upload a training file
Once you have the data validated, the file needs to be uploaded using the [Files API](/docs/api-reference/files/create) in order to be used with a fine-tuning job:
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
client.files.create(
file=open("mydata.jsonl", "rb"),
purpose="fine-tune"
)
`.trim(),
"node.js": `
import OpenAI, { toFile } from 'openai';\n
const openai = new OpenAI();\n
// If you have access to Node fs we recommend using fs.createReadStream():
await openai.files.create({ file: fs.createReadStream('mydata.jsonl'), purpose: 'fine-tune' });\n
// Or if you have the web File API you can pass a File instance:
await openai.files.create({ file: new File(['my bytes'], 'mydata.jsonl'), purpose: 'fine-tune' });\n
// You can also pass a fetch Response:
await openai.files.create({ file: await fetch('https://somesite/mydata.jsonl'), purpose: 'fine-tune' });
`.trim(),
curl: `
curl https://api.openai.com/v1/files \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-F purpose="fine-tune" \\
-F file="@mydata.jsonl"
`.trim(),
}}
/>
After you upload the file, it may take some time to process. While the file is processing, you can still create a fine-tuning job but it will not start until the file processing has completed.
The maximum file upload size is 1 GB, though we do not suggest fine-tuning with that amount of data since you are unlikely to need that large of an amount to see improvements.
## Create a fine-tuned model
After ensuring you have the right amount and structure for your dataset, and have uploaded the file, the next step is to create a fine-tuning job. We support creating fine-tuning jobs via the [fine-tuning UI](/finetune) or programmatically.
To start a fine-tuning job using the OpenAI SDK:
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
client.fine_tuning.jobs.create(
training_file="file-abc123",
model="gpt-3.5-turbo"
)
`.trim(),
"node.js": `
const fineTune = await openai.fineTuning.jobs.create({ training_file: 'file-abc123', model: 'gpt-3.5-turbo' });
`.trim(),
}}
/>
In this example, `model` is the name of the model you want to fine-tune (`gpt-3.5-turbo`, `babbage-002`, `davinci-002`, or an existing fine-tuned model) and `training_file` is the file ID that was returned when the training file was uploaded to the OpenAI API. You can customize your fine-tuned model's name using the [suffix parameter](/docs/api-reference/fine-tuning/create#fine-tuning/create-suffix).
To set additional fine-tuning parameters like the `validation_file` or `hyperparameters`, please refer to the [API specification for fine-tuning](/docs/api-reference/fine-tuning/create).
After you've started a fine-tuning job, it may take some time to complete. Your job may be queued behind other jobs in our system, and training a model can take minutes or hours depending on the model and dataset size. After the model training is completed, the user who created the fine-tuning job will receive an email confirmation.
In addition to creating a fine-tuning job, you can also list existing jobs, retrieve the status of a job, or cancel a job.
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
# List 10 fine-tuning jobs
client.fine_tuning.jobs.list(limit=10)\n
# Retrieve the state of a fine-tune
client.fine_tuning.jobs.retrieve("ftjob-abc123")\n
# Cancel a job
client.fine_tuning.jobs.cancel("ftjob-abc123")\n
# List up to 10 events from a fine-tuning job
client.fine_tuning.jobs.list_events(fine_tuning_job_id="ftjob-abc123", limit=10)\n
# Delete a fine-tuned model (must be an owner of the org the model was created in)
client.models.delete("ft:gpt-3.5-turbo:acemeco:suffix:abc123")
`.trim(),
"node.js": `
// List 10 fine-tuning jobs
let page = await openai.fineTuning.jobs.list({ limit: 10 });\n
// Retrieve the state of a fine-tune
let fineTune = await openai.fineTuning.jobs.retrieve('ftjob-abc123');\n
// Cancel a job
let status = await openai.fineTuning.jobs.cancel('ftjob-abc123');\n
// List up to 10 events from a fine-tuning job
let events = await openai.fineTuning.jobs.listEvents(fineTune.id, { limit: 10 });\n
// Delete a fine-tuned model (must be an owner of the org the model was created in)
let model = await openai.models.delete('ft:gpt-3.5-turbo:acemeco:suffix:abc123');
`.trim(),
}}
/>
## Use a fine-tuned model
When a job has succeeded, you will see the `fine_tuned_model` field populated with the name of the model when you retrieve the job details. You may now specify this model as a parameter in the [Chat Completions](/docs/api-reference/chat) (for `gpt-3.5-turbo`) or [legacy Completions](/docs/api-reference/completions) API (for `babbage-002` and `davinci-002`), and make requests to it using the [Playground](/playground).
After your job is completed, the model should be available right away for inference use. In some cases, it may take several minutes for your model to become ready to handle requests. If requests to your model time out or the model name cannot be found, it is likely because your model is still being loaded. If this happens, try again in a few minutes.
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
completion = client.chat.completions.create(
model="ft:gpt-3.5-turbo:my-org:custom_suffix:id",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello!"}
]
)
print(completion.choices[0].message)
`.trim(),
"node.js": `
async function main() {
const completion = await openai.chat.completions.create({
messages: [{ role: "system", content: "You are a helpful assistant." }],
model: "ft:gpt-3.5-turbo:my-org:custom_suffix:id",
});
console.log(completion.choices[0]);
}
main();
`.trim(),
}}
/>
You can start making requests by passing the model name as shown above and in our [GPT guide](/docs/guides/text-generation/chat-completions-api).
## Use a checkpointed model
In addition to creating a final fine-tuned model at the end of each fine-tuning job, OpenAI will create one full model checkpoint for you at the end of each training epoch. These checkpoints are themselves full models that can be used within our completions and chat-completions endpoints. Checkpoints are useful as they potentially provide a version of your fine-tuned model from before it experienced overfitting.
To access these checkpoints,
1. Wait until a job succeeds, which you can verify by [querying the status of a job.](/docs/api-reference/fine-tuning/retrieve)
2. [Query the checkpoints endpoint](/docs/api-reference/fine-tuning/list-checkpoints) with your fine-tuning job ID to access a list of model checkpoints for the fine-tuning job.
For each checkpoint object, you will see the `fine_tuned_model_checkpoint` field populated with the name of the model checkpoint. You may now use this model just like you would with the [final fine-tuned model](/docs/guides/fine-tuning/use-a-fine-tuned-model).
```json
{
"object": "fine_tuning.job.checkpoint",
"id": "ftckpt_zc4Q7MP6XxulcVzj4MZdwsAB",
"created_at": 1519129973,
"fine_tuned_model_checkpoint": "ft:gpt-3.5-turbo-0125:my-org:custom-suffix:96olL566:ckpt-step-2000",
"metrics": {
"full_valid_loss": 0.134,
"full_valid_mean_token_accuracy": 0.874
},
"fine_tuning_job_id": "ftjob-abc123",
"step_number": 2000
}
```
Each checkpoint will specify its:
- `step_number`: The step at which the checkpoint was created (where each epoch corresponds to a number of steps equal to the size of the training set divided by the batch size)
- `metrics`: an object containing the metrics for your fine-tuning job at the step when the checkpoint was created.
Currently, only the checkpoints for the last 3 epochs of the job are saved and available for use. We plan to release more complex and flexible checkpointing strategies in the near future.
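A minimal sketch of step 2, assuming a recent version of the openai Python SDK that exposes the checkpoints endpoint (the job ID is a placeholder):
```python
from openai import OpenAI

client = OpenAI()

# "ftjob-abc123" is a placeholder ID for a job that has already succeeded.
checkpoints = client.fine_tuning.jobs.checkpoints.list("ftjob-abc123")

for checkpoint in checkpoints.data:
    print(checkpoint.step_number, checkpoint.fine_tuned_model_checkpoint)
```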
## Analyzing your fine-tuned model
We provide the following training metrics computed over the course of training:
- training loss
- training token accuracy
- valid loss
- valid token accuracy
Valid loss and valid token accuracy are computed in two different ways - on a small batch of the data during each step, and on the full valid split at the end of each epoch. The full valid loss and full valid token accuracy metrics are the most accurate metrics tracking the overall performance of your model. These statistics are meant to provide a sanity check that training went smoothly (loss should decrease, token accuracy should increase). While an active fine-tuning job is running, you can view an event object which contains some useful metrics:
```json
{
"object": "fine_tuning.job.event",
"id": "ftevent-abc-123",
"created_at": 1693582679,
"level": "info",
"message": "Step 300/300: training loss=0.15, validation loss=0.27, full validation loss=0.40",
"data": {
"step": 300,
"train_loss": 0.14991648495197296,
"valid_loss": 0.26569826706596045,
"total_steps": 300,
"full_valid_loss": 0.4032616495084362,
"train_mean_token_accuracy": 0.9444444179534912,
"valid_mean_token_accuracy": 0.9565217391304348,
"full_valid_mean_token_accuracy": 0.9089635854341737
},
"type": "metrics"
}
```
After a fine-tuning job has finished, you can also see metrics around how the training process went by [querying a fine-tuning job](/docs/api-reference/fine-tuning/retrieve), extracting a file ID from the `result_files`, and then [retrieving that file's content](/docs/api-reference/files/retrieve-contents). Each results CSV file has the following columns: `step`, `train_loss`, `train_accuracy`, `valid_loss`, and `valid_mean_token_accuracy`.
```csv
step,train_loss,train_accuracy,valid_loss,valid_mean_token_accuracy
1,1.52347,0.0,,
2,0.57719,0.0,,
3,3.63525,0.0,,
4,1.72257,0.0,,
5,1.52379,0.0,,
```
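A minimal sketch of pulling those results into a DataFrame, assuming the openai Python SDK and pandas are installed (the job ID is a placeholder):
```python
import io

import pandas as pd
from openai import OpenAI

client = OpenAI()

# "ftjob-abc123" is a placeholder ID; result_files holds the IDs of the
# results CSV files produced by the finished job.
job = client.fine_tuning.jobs.retrieve("ftjob-abc123")
result_file_id = job.result_files[0]
csv_bytes = client.files.content(result_file_id).read()

metrics = pd.read_csv(io.BytesIO(csv_bytes))
print(metrics.tail())  # step, train_loss, train_accuracy, valid_loss, valid_mean_token_accuracy
```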
While metrics can be helpful, evaluating samples from the fine-tuned model provides the most relevant sense of model quality. We recommend generating samples from both the base model and the fine-tuned model on a test set, and comparing the samples side by side. The test set should ideally include the full distribution of inputs that you might send to the model in a production use case. If manual evaluation is too time-consuming, consider using our [Evals library](https://github.com/openai/evals) to automate future evaluations.
### Iterating on data quality
If the results from a fine-tuning job are not as good as you expected, consider the following ways to adjust the training dataset:
- Collect examples to target remaining issues
  - If the model still isn't good at certain aspects, add training examples that directly show the model how to do these aspects correctly
- Scrutinize existing examples for issues
  - If your model has grammar, logic, or style issues, check if your data has any of the same issues. For instance, if the model now says "I will schedule this meeting for you" (when it shouldn't), see if existing examples teach the model to say it can do new things that it can't do
- Consider the balance and diversity of data
  - If 60% of the assistant responses in the data say "I cannot answer this", but at inference time only 5% of responses should say that, you will likely get an overabundance of refusals
- Make sure your training examples contain all of the information needed for the response
- If we want the model to compliment a user based on their personal traits and a training example includes assistant compliments for traits not found in the preceding conversation, the model may learn to hallucinate information
- Look at the agreement / consistency in the training examples
  - If multiple people created the training data, it's likely that model performance will be limited by the level of agreement / consistency between people. For instance, in a text extraction task, if people only agreed on 70% of extracted snippets, the model would likely not be able to do better than this
- Make sure all of your training examples are in the same format, as expected for inference
### Iterating on data quantity
Once you're satisfied with the quality and distribution of the examples, you can consider scaling up the number of training examples. This tends to help the model learn the task better, especially around possible "edge cases". We expect a similar amount of improvement every time you double the number of training examples. You can loosely estimate the expected quality gain from increasing the training data size by:
- Fine-tuning on your current dataset
- Fine-tuning on half of your current dataset
- Observing the quality gap between the two
In general, if you have to make a trade-off, a smaller amount of high-quality data is generally more effective than a larger amount of low-quality data.
### Iterating on hyperparameters
We allow you to specify the following hyperparameters:
- epochs
- learning rate multiplier
- batch size
We recommend initially training without specifying any of these, allowing us to pick a default for you based on dataset size, then adjusting if you observe the following:
- If the model does not follow the training data as much as expected, increase the number of epochs by 1 or 2
- This is more common for tasks for which there is a single ideal completion (or a small set of ideal completions which are similar). Some examples include classification, entity extraction, or structured parsing. These are often tasks for which you can compute a final accuracy metric against a reference answer.
- If the model becomes less diverse than expected, decrease the number of epochs by 1 or 2
- This is more common for tasks for which there are a wide range of possible good completions
- If the model does not appear to be converging, increase the learning rate multiplier
You can set the hyperparameters as is shown below:
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
client.fine_tuning.jobs.create(
training_file="file-abc123",
model="gpt-3.5-turbo",
hyperparameters={
"n_epochs":2
}
)
`.trim(),
"node.js": `
const fineTune = await openai.fineTuning.jobs.create({training_file: "file-abc123", model: "gpt-3.5-turbo", hyperparameters: { n_epochs: 2 }});
`.trim(),
}}
/>
## Fine-tuning examples
Now that we have explored the basics of the fine-tuning API, let's look at going through the fine-tuning lifecycle for a few different use cases.
In this example, we will explore how to build a fine-tuned model which gets the model to follow specific style and tone guidance beyond what is possible with prompting alone.
To begin, we create a sample set of messages showing the style and tone the model should follow, which in this case is a factual but sarcastic chatbot:
```jsonl
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "Paris, as if everyone doesn't know that already."}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "Oh, just some guy named William Shakespeare. Ever heard of him?"}]}
{"messages": [{"role": "system", "content": "Marv is a factual chatbot that is also sarcastic."}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "Around 384,400 kilometers. Give or take a few, like that really matters."}]}
```
If you want to follow along and create a fine-tuned model yourself, you will need at least 10 examples.
After getting the data that will potentially improve the model, the next step is to check if the data meets all the [formatting requirements](/docs/guides/fine-tuning/check-data-formatting).
Now that we have the data formatted and validated, the final training step is to kick off a job to create the fine-tuned model. You can do this via the OpenAI CLI or one of our SDKs as shown below:
```python
from openai import OpenAI
client = OpenAI()
file = client.files.create(
file=open("marv.jsonl", "rb"),
purpose="fine-tune"
)
client.fine_tuning.jobs.create(
training_file=file.id,
model="gpt-3.5-turbo"
)
```
Once the training job is done, you will be able to [use your fine-tuned model](/docs/guides/fine-tuning/use-a-fine-tuned-model).
Another type of use case which works really well with fine-tuning is getting the model to provide structured information, in this case about sports headlines:
```jsonl
{"messages": [{"role": "system", "content": "Given a sports headline, provide the following fields in a JSON dict, where applicable: \"player\" (full name), \"team\", \"sport\", and \"gender\"."}, {"role": "user", "content": "Sources: Colts grant RB Taylor OK to seek trade"}, {"role": "assistant", "content": "{\"player\": \"Jonathan Taylor\", \"team\": \"Colts\", \"sport\": \"football\", \"gender\": \"male\" }"}]}
{"messages": [{"role": "system", "content": "Given a sports headline, provide the following fields in a JSON dict, where applicable: \"player\" (full name), \"team\", \"sport\", and \"gender\"."}, {"role": "user", "content": "OSU 'split down middle' on starting QB battle"}, {"role": "assistant", "content": "{\"player\": null, \"team\": \"OSU\", \"sport\": \"football\", \"gender\": null }"}]}
```
If you want to follow along and create a fine-tuned model yourself, you will need at least 10 examples.
After getting the data that will potentially improve the model, the next step is to check if the data meets all the [formatting requirements](/docs/guides/fine-tuning/check-data-formatting).
Now that we have the data formatted and validated, the final training step is to kick off a job to create the fine-tuned model. You can do this via the OpenAI CLI or one of our SDKs as shown below:
```python
from openai import OpenAI
client = OpenAI()
file = client.files.create(
file=open("sports-context.jsonl", "rb"),
purpose="fine-tune"
)
client.fine_tuning.jobs.create(
training_file=file.id,
model="gpt-3.5-turbo"
)
```
Once the training job is done, you will be able to [use your fine-tuned model](/docs/guides/fine-tuning/use-a-fine-tuned-model) and make a request that looks like the following:
```python
completion = client.chat.completions.create(
model="ft:gpt-3.5-turbo:my-org:custom_suffix:id",
messages=[
{"role": "system", "content": "Given a sports headline, provide the following fields in a JSON dict, where applicable: player (full name), team, sport, and gender"},
{"role": "user", "content": "Richardson wins 100m at worlds to cap comeback"}
]
)
print(completion.choices[0].message)
```
Based on the formatted training data, the response should look like the following:
```json
{
"player": "Sha'Carri Richardson",
"team": null,
"sport": "track and field",
"gender": "female"
}
```
The chat completions API supports tool calling. Including a long list of tools in the completions API can consume a considerable number of prompt tokens and sometimes the model hallucinates or does not provide valid JSON output.
Fine-tuning a model with tool calling examples can allow you to:
- Get similarly formatted responses even when the full tool definition isn't present
- Get more accurate and consistent outputs
Format your examples as shown, with each line including a list of "messages" and an optional list of "tools":
```json
{
"messages": [
{ "role": "user", "content": "What is the weather in San Francisco?" },
{
"role": "assistant",
"tool_calls": [
{
"id": "call_id",
"type": "function",
"function": {
"name": "get_current_weather",
"arguments": "{\"location\": \"San Francisco, USA\", \"format\": \"celsius\"}"
}
}
]
}
],
"tools": [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and country, eg. San Francisco, USA"
},
"format": { "type": "string", "enum": ["celsius", "fahrenheit"] }
},
"required": ["location", "format"]
}
}
}
]
}
```
If you want to follow along and create a fine-tuned model yourself, you will need at least 10 examples.
If your goal is to use fewer tokens, some useful techniques are:
- Omit function and parameter descriptions: remove the description field from function and parameters
- Omit parameters: remove the entire properties field from the parameters object
- Omit function entirely: remove the entire function object from the functions array
If your goal is to maximize the correctness of the function calling output, we recommend using the same tool definitions for both training and querying the fine-tuned model.
Fine-tuning on function calling can also be used to customize the model's response to function outputs. To do this you can include a function response message and an assistant message interpreting that response:
```json
{
"messages": [
{"role": "user", "content": "What is the weather in San Francisco?"},
{"role": "assistant", "tool_calls": [{"id": "call_id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"San Francisco, USA\", \"format\": \"celsius\"}"}}]}
{"role": "tool", "tool_call_id": "call_id", "content": "21.0"},
{"role": "assistant", "content": "It is 21 degrees celsius in San Francisco, CA"}
],
"tools": [...] // same as before
}
```
Parallel function calling is enabled by default and can be disabled by using `parallel_tool_calls: false` in the training example.
`function_call` and `functions` have been deprecated in favor of `tools`; it is recommended to use the `tools` parameter instead.
The chat completions API supports function calling. Including a long list of functions in the completions API can consume a considerable number of prompt tokens and sometimes the model hallucinates or does not provide valid JSON output.
Fine-tuning a model with function calling examples can allow you to:
- Get similarly formatted responses even when the full function definition isn't present
- Get more accurate and consistent outputs
Format your examples as shown, with each line including a list of "messages" and an optional list of "functions":
```json
{
"messages": [
{ "role": "user", "content": "What is the weather in San Francisco?" },
{
"role": "assistant",
"function_call": {
"name": "get_current_weather",
"arguments": "{\"location\": \"San Francisco, USA\", \"format\": \"celsius\"}"
}
}
],
"functions": [
{
"name": "get_current_weather",
"description": "Get the current weather",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and country, eg. San Francisco, USA"
},
"format": { "type": "string", "enum": ["celsius", "fahrenheit"] }
},
"required": ["location", "format"]
}
}
]
}
```
If you want to follow along and create a fine-tuned model yourself, you will need at least 10 examples.
If your goal is to use fewer tokens, some useful techniques are:
- Omit function and parameter descriptions: remove the description field from function and parameters
- Omit parameters: remove the entire properties field from the parameters object
- Omit function entirely: remove the entire function object from the functions array
If your goal is to maximize the correctness of the function calling output, we recommend using the same function definitions for both training and querying the fine-tuned model.
Fine-tuning on function calling can also be used to customize the model's response to function outputs. To do this you can include a function response message and an assistant message interpreting that response:
```json
{
"messages": [
{"role": "user", "content": "What is the weather in San Francisco?"},
{"role": "assistant", "function_call": {"name": "get_current_weather", "arguments": "{\"location\": \"San Francisco, USA\", \"format\": \"celsius\"}"}}
{"role": "function", "name": "get_current_weather", "content": "21.0"},
{"role": "assistant", "content": "It is 21 degrees celsius in San Francisco, CA"}
],
"functions": [...] // same as before
}
```
# Fine-tuning Integrations
OpenAI provides the ability for you to integrate your fine-tuning jobs with 3rd parties via our integration framework. Integrations generally allow you to track
job state, status, metrics, hyperparameters, and other job-related information in a 3rd party system. You can also use integrations to trigger actions in a 3rd party system based on job state changes. Currently, the only supported integration is with [Weights and Biases](https://wandb.ai), but more are coming soon.
## Weights and Biases Integration
[Weights and Biases (W&B)](https://wandb.ai) is a popular tool for tracking machine learning experiments. You can use the OpenAI integration with W&B to track your fine-tuning jobs in W&B. This integration will automatically log metrics, hyperparameters, and other job-related information to the W&B project you specify.
To integrate your fine-tuning jobs with W&B, you'll need to
1. Provide authentication credentials for your Weights and Biases account to OpenAI
2. Configure the W&B integration when creating new fine-tuning jobs
### Authenticate your Weights and Biases account with OpenAI
Authentication is done by submitting a valid W&B API key to OpenAI. Currently, this can only be done via the [Account Dashboard](https://platform.openai.com/account/organization), and only by account administrators. Your W&B API key will be stored encrypted within OpenAI and will allow OpenAI to post metrics and metadata on your behalf to W&B when your fine-tuning jobs are running. Attempting to enable a W&B integration on a fine-tuning job without first authenticating your OpenAI organization with WandB will result in an error.
<img
className="wandb-auth-image"
src="https://cdn.openai.com/API/images/guides/WandB_Integration.png"
/>
### Enable the Weights and Biases integration
When creating a new fine-tuning job, you can enable the W&B integration by including a new `"wandb"` integration under the `integrations` field in the job creation request. This integration allows you to specify the W&B Project that you wish the newly created W&B Run to show up under.
Here's an example of how to enable the W&B integration when creating a new fine-tuning job:
```curl
curl -X POST \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer $OPENAI_API_KEY" \
  -d '{
    "model": "gpt-3.5-turbo-0125",
    "training_file": "file-ABC123",
    "validation_file": "file-DEF456",
    "integrations": [
        {
            "type": "wandb",
            "wandb": {
                "project": "custom-wandb-project",
                "tags": ["project:tag", "lineage"]
            }
        }
    ]
}' https://api.openai.com/v1/fine_tuning/jobs
```
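The same job can be created through the Python SDK, assuming an SDK version recent enough to accept the `integrations` parameter (the file IDs and project name are the placeholders from the curl example):
```python
from openai import OpenAI

client = OpenAI()

# File IDs and the project name are placeholders matching the curl example above.
client.fine_tuning.jobs.create(
    model="gpt-3.5-turbo-0125",
    training_file="file-ABC123",
    validation_file="file-DEF456",
    integrations=[
        {
            "type": "wandb",
            "wandb": {
                "project": "custom-wandb-project",
                "tags": ["project:tag", "lineage"],
            },
        }
    ],
)
```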
By default, the Run ID and Run display name are the ID of your fine-tuning job (e.g. `ftjob-abc123`). You can customize the display name of the run by including a `"name"` field in the `wandb` object. You can also include a `"tags"` field in the `wandb` object to add tags to the W&B Run (tags must be <= 64 character strings and there is a maximum of 50 tags).
Sometimes it is convenient to explicitly set the [W&B Entity](https://docs.wandb.ai/guides/runs/manage-runs#send-new-runs-to-a-team) to be associated with the run. You can do this by including an `"entity"` field in the `wandb` object. If you do not include an `"entity"` field, the W&B entity will default to the default W&B entity associated with the API key you registered previously.
The full specification for the integration can be found in our [fine-tuning job creation](/docs/api-reference/fine-tuning/create) documentation.
### View your fine-tuning job in Weights and Biases
Once you've created a fine-tuning job with the W&B integration enabled, you can view the job in W&B by navigating to the W&B project you specified in the job creation request. Your run should be located at the URL: `https://wandb.ai/<WANDB-ENTITY>/<WANDB-PROJECT>/runs/ftjob-ABCDEF`.
You should see a new run with the name and tags you specified in the job creation request. The Run Config will contain relevant job metadata such as:
- `model`: The model you are fine-tuning
- `training_file`: The ID of the training file
- `validation_file`: The ID of the validation file
- `hyperparameters`: The hyperparameters used for the job (e.g. `n_epochs`, `learning_rate`, `batch_size`)
- `seed`: The random seed used for the job
Likewise, OpenAI will set some default tags on the run to make it easier for you to search and filter. These tags will be prefixed with `"openai/"` and will include:
- `openai/fine-tuning`: Tag to let you know this run is a fine-tuning job
- `openai/ft-abc123`: The ID of the fine-tuning job
- `openai/gpt-3.5-turbo-0125`: The model you are fine-tuning
An example W&B run generated from an OpenAI fine-tuning job is shown below:
<img
className="wandb-auth-image"
src="https://cdn.openai.com/API/images/guides/WandB_Integration_Dashboard1.png"
/>
Metrics for each step of the fine-tuning job will be logged to the W&B run. These metrics are the same metrics provided in the [fine-tuning job event](/docs/api-reference/fine-tuning/list-events) object and are the same metrics you can view via the [OpenAI fine-tuning Dashboard](https://platform.openai.com/finetune). You can use W&B's visualization tools to track the progress of your fine-tuning job and compare it to other fine-tuning jobs you've run.
An example of the metrics logged to a W&B run is shown below:
<img
className="wandb-auth-image"
src="https://cdn.openai.com/API/images/guides/WandB_Integration_Dashboard2.png"
/>
# FAQ
### When should I use fine-tuning vs embeddings / retrieval augmented generation?
Embeddings with retrieval is best suited for cases when you need to have a large database of documents with relevant context and information.
By default, OpenAI's models are trained to be helpful generalist assistants. Fine-tuning can be used to make a model which is narrowly focused, and exhibits specific ingrained behavior patterns. Retrieval strategies can be used to make new information available to a model by providing it with relevant context before generating its response. Retrieval strategies are not an alternative to fine-tuning and can in fact be complementary to it.
You can explore the differences between these options further in our Developer Day talk:
<iframe
width="100%"
height="315"
src="https://www.youtube-nocookie.com/embed/ahnGLM-RC1Y?si=cPQngClssVG_R2_q"
title="YouTube video player"
frameBorder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
allowFullScreen
>
### Can I fine-tune GPT-4o, GPT-4 Turbo or GPT-4?
GPT-4 fine-tuning is in experimental access and eligible developers can request access via the [fine-tuning UI](/finetune). GPT-4 fine-tuning is available for the `gpt-4-0613` and `gpt-4o-2024-05-13` models (not any of the `gpt-4-turbo` models).
`gpt-3.5-turbo-1106` and `gpt-3.5-turbo-0125` support up to 16K context examples.
### How do I know if my fine-tuned model is actually better than the base model?
We recommend generating samples from both the base model and the fine-tuned model on a test set of chat conversations, and comparing the samples side by side. For more comprehensive evaluations, consider using the [OpenAI evals framework](https://github.com/openai/evals) to create an eval specific to your use case.
### Can I continue fine-tuning a model that has already been fine-tuned?
Yes, you can pass the name of a fine-tuned model into the `model` parameter when creating a fine-tuning job. This will start a new fine-tuning job using the fine-tuned model as the starting point.
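A minimal sketch, with placeholder file and model names:
```python
from openai import OpenAI

client = OpenAI()

# Pass the full name of an existing fine-tuned model as the base model.
# The file ID and model name below are placeholders.
client.fine_tuning.jobs.create(
    training_file="file-newdata123",
    model="ft:gpt-3.5-turbo:my-org:custom_suffix:id",
)
```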
### How can I estimate the cost of fine-tuning a model?
Please refer to the [estimate cost](/docs/guides/fine-tuning/estimate-costs) section above.
### How many fine-tuning jobs can I have running at once?
Please refer to our [rate limit guide](/docs/guides/rate-limits/what-are-the-rate-limits-for-our-api) for the most up to date information on the limits.
### How do rate limits work on fine-tuned models?
A fine-tuned model pulls from the same shared rate limit as the model it is based off of. For example, if you use half your TPM rate limit in a given time period with the standard `gpt-3.5-turbo` model, any model(s) you fine-tuned from `gpt-3.5-turbo` would only have the remaining half of the TPM rate limit accessible since the capacity is shared across all models of the same type.
Put another way, having fine-tuned models does not give you more capacity to use our models from a total throughput perspective.
### Can I use the /v1/fine-tunes endpoint?
The `/v1/fine-tunes` endpoint has been deprecated in favor of the `/v1/fine_tuning/jobs` endpoint.
For users migrating from `/v1/fine-tunes` to the updated `/v1/fine_tuning/jobs` API and newer models, the main difference you can expect is the updated API. The legacy prompt completion pair data format has been retained for the updated `babbage-002` and `davinci-002` models to ensure a smooth transition. The new models will support fine-tuning with 4k token context and have a knowledge cutoff of September 2021.
For most tasks, you should expect to get better performance from `gpt-3.5-turbo` than from the GPT base models.

@ -0,0 +1,122 @@
Runs are asynchronous, which means you'll want to monitor their `status` by polling the Run object until a
[terminal status](https://platform.openai.com/docs/assistants/how-it-works/runs-and-run-steps) is reached. For convenience, the 'create and poll' SDK helpers assist both in
creating the run and then polling for its completion. Once the Run completes, you can list the
Messages added to the Thread by the Assistant. Finally, you would retrieve all the `tool_outputs` from
`required_action` and submit them at the same time to the 'submit tool outputs and poll' helper.
<CodeSample
defaultLanguage="python"
code={{
python: `
run = client.beta.threads.runs.create_and_poll(
thread_id=thread.id,
assistant_id=assistant.id,
)
if run.status == 'completed':
messages = client.beta.threads.messages.list(
thread_id=thread.id
)
print(messages)
else:
print(run.status)
# Define the list to store tool outputs
tool_outputs = []
# Loop through each tool in the required action section
for tool in run.required_action.submit_tool_outputs.tool_calls:
if tool.function.name == "get_current_temperature":
tool_outputs.append({
"tool_call_id": tool.id,
"output": "57"
})
elif tool.function.name == "get_rain_probability":
tool_outputs.append({
"tool_call_id": tool.id,
"output": "0.06"
})
# Submit all tool outputs at once after collecting them in a list
if tool_outputs:
try:
run = client.beta.threads.runs.submit_tool_outputs_and_poll(
thread_id=thread.id,
run_id=run.id,
tool_outputs=tool_outputs
)
print("Tool outputs submitted successfully.")
except Exception as e:
print("Failed to submit tool outputs:", e)
else:
print("No tool outputs to submit.")
if run.status == 'completed':
messages = client.beta.threads.messages.list(
thread_id=thread.id
)
print(messages)
else:
print(run.status)
`.trim(),
"node.js": `
const handleRequiresAction = async (run) => {
// Check if there are tools that require outputs
if (
run.required_action &&
run.required_action.submit_tool_outputs &&
run.required_action.submit_tool_outputs.tool_calls
) {
// Loop through each tool in the required action section
const toolOutputs = run.required_action.submit_tool_outputs.tool_calls.map(
(tool) => {
if (tool.function.name === "getCurrentTemperature") {
return {
tool_call_id: tool.id,
output: "57",
};
} else if (tool.function.name === "getRainProbability") {
return {
tool_call_id: tool.id,
output: "0.06",
};
}
},
);\n
// Submit all tool outputs at once after collecting them in a list
if (toolOutputs.length > 0) {
run = await client.beta.threads.runs.submitToolOutputsAndPoll(
thread.id,
run.id,
{ tool_outputs: toolOutputs },
);
console.log("Tool outputs submitted successfully.");
} else {
console.log("No tool outputs to submit.");
}\n
// Check status after submitting tool outputs
return handleRunStatus(run);
}
};\n
const handleRunStatus = async (run) => {
// Check if the run is completed
if (run.status === "completed") {
let messages = await client.beta.threads.messages.list(thread.id);
console.log(messages.data);
return messages.data;
} else if (run.status === "requires_action") {
console.log(run.status);
return await handleRequiresAction(run);
} else {
console.error("Run did not complete:", run);
}
};\n
// Create and poll run
let run = await client.beta.threads.runs.createAndPoll(thread.id, {
assistant_id: assistant.id,
});\n
handleRunStatus(run);
`.trim(),
}}
/>

@ -0,0 +1,124 @@
For the streaming case, we create an EventHandler class to handle events in the response stream and submit all tool outputs at once with the “submit tool outputs stream” helper in the Python and Node SDKs.
<CodeSample
defaultLanguage="python"
code={{
python: `
from typing_extensions import override
from openai import AssistantEventHandler
class EventHandler(AssistantEventHandler):
@override
def on_event(self, event):
# Retrieve events that are denoted with 'requires_action'
# since these will have our tool_calls
if event.event == 'thread.run.requires_action':
run_id = event.data.id # Retrieve the run ID from the event data
self.handle_requires_action(event.data, run_id)
def handle_requires_action(self, data, run_id):
tool_outputs = []
for tool in data.required_action.submit_tool_outputs.tool_calls:
if tool.function.name == "get_current_temperature":
tool_outputs.append({"tool_call_id": tool.id, "output": "57"})
elif tool.function.name == "get_rain_probability":
tool_outputs.append({"tool_call_id": tool.id, "output": "0.06"})
# Submit all tool_outputs at the same time
self.submit_tool_outputs(tool_outputs, run_id)
def submit_tool_outputs(self, tool_outputs, run_id):
# Use the submit_tool_outputs_stream helper
with client.beta.threads.runs.submit_tool_outputs_stream(
thread_id=self.current_run.thread_id,
run_id=self.current_run.id,
tool_outputs=tool_outputs,
event_handler=EventHandler(),
) as stream:
for text in stream.text_deltas:
print(text, end="", flush=True)
print()
with client.beta.threads.runs.stream(
thread_id=thread.id,
assistant_id=assistant.id,
event_handler=EventHandler()
) as stream:
stream.until_done()
`.trim(),
"node.js": `
class EventHandler extends EventEmitter {
constructor(client) {
super();
this.client = client;
}\n
async onEvent(event) {
try {
console.log(event);
// Retrieve events that are denoted with 'requires_action'
// since these will have our tool_calls
if (event.event === "thread.run.requires_action") {
await this.handleRequiresAction(
event.data,
event.data.id,
event.data.thread_id,
);
}
} catch (error) {
console.error("Error handling event:", error);
}
}\n
async handleRequiresAction(data, runId, threadId) {
try {
const toolOutputs =
data.required_action.submit_tool_outputs.tool_calls.map((toolCall) => {
if (toolCall.function.name === "getCurrentTemperature") {
return {
tool_call_id: toolCall.id,
output: "57",
};
} else if (toolCall.function.name === "getRainProbability") {
return {
tool_call_id: toolCall.id,
output: "0.06",
};
}
});
// Submit all the tool outputs at the same time
await this.submitToolOutputs(toolOutputs, runId, threadId);
} catch (error) {
console.error("Error processing required action:", error);
}
}\n
async submitToolOutputs(toolOutputs, runId, threadId) {
try {
// Use the submitToolOutputsStream helper
const stream = this.client.beta.threads.runs.submitToolOutputsStream(
threadId,
runId,
{ tool_outputs: toolOutputs },
);
for await (const event of stream) {
this.emit("event", event);
}
} catch (error) {
console.error("Error submitting tool outputs:", error);
}
}
}\n
const eventHandler = new EventHandler(client);
eventHandler.on("event", eventHandler.onEvent.bind(eventHandler));\n
const stream = await client.beta.threads.runs.stream(
threadId,
{ assistant_id: assistantId },
eventHandler,
);\n
for await (const event of stream) {
eventHandler.emit("event", event);
}
`.trim(),
}}
/>

@ -0,0 +1,252 @@
# Function calling
Learn how to connect large language models to external tools.
## Introduction
In an API call, you can describe functions and have the model intelligently choose to output a JSON object containing arguments to call one or many functions. The Chat Completions API does not call the function; instead, the model generates JSON that you can use to call the function in your code.
The latest models (`gpt-4o`, `gpt-4-turbo`, and `gpt-3.5-turbo`) have been trained to both detect when a function should be called (depending on the input) and to respond with JSON that adheres to the function signature more closely than previous models. With this capability also comes potential risks. We strongly recommend building in user confirmation flows before taking actions that impact the world on behalf of users (sending an email, posting something online, making a purchase, etc).
This guide is focused on function calling with the Chat Completions API. For details on function calling in the Assistants API, please see the Assistants Tools page.
### Common use cases
Function calling allows you to more reliably get structured data back from the model. For example, you can:
- Create assistants that answer questions by calling external APIs
- e.g. define functions like `send_email(to: string, body: string)`, or `get_current_weather(location: string, unit: 'celsius' | 'fahrenheit')`
- Convert natural language into API calls
- e.g. convert "Who are my top customers?" to `get_customers(min_revenue: int, created_before: string, limit: int)` and call your internal API
- Extract structured data from text
- e.g. define a function called `extract_data(name: string, birthday: string)`, or `sql_query(query: string)`
...and much more!
The basic sequence of steps for function calling is as follows:
1. Call the model with the user query and a set of functions defined in the [functions parameter](/docs/api-reference/chat/create#chat/create-functions).
2. The model can choose to call one or more functions; if so, the content will be a stringified JSON object adhering to your custom schema (note: the model may hallucinate parameters).
3. Parse the string into JSON in your code, and call your function with the provided arguments if they exist.
4. Call the model again by appending the function response as a new message, and let the model summarize the results back to the user.
### Supported models
Not all model versions are trained with function calling data. Function calling is supported with the following models: `gpt-4o`, `gpt-4o-2024-05-13`, `gpt-4-turbo`, `gpt-4-turbo-2024-04-09`, `gpt-4-turbo-preview`, `gpt-4-0125-preview`, `gpt-4-1106-preview`, `gpt-4`, `gpt-4-0613`, `gpt-3.5-turbo`, `gpt-3.5-turbo-0125`, `gpt-3.5-turbo-1106`, and `gpt-3.5-turbo-0613`.
In addition, parallel function calling is supported on the following models: `gpt-4o`, `gpt-4o-2024-05-13`, `gpt-4-turbo`, `gpt-4-turbo-2024-04-09`, `gpt-4-turbo-preview`, `gpt-4-0125-preview`, `gpt-4-1106-preview`, `gpt-3.5-turbo-0125`, and `gpt-3.5-turbo-1106`.
## Function calling behavior
The default behavior for [`tool_choice`](/docs/api-reference/chat/create#chat-create-tool_choice) is `tool_choice: "auto"`. This lets the model decide whether to call functions and, if so, which functions to call.
We offer three ways to customize the default behavior depending on your use case:
1. To force the model to always call one or more functions, you can set `tool_choice: "required"`. The model will then select which function(s) to call.
2. To force the model to call only one specific function, you can set `tool_choice: {"type": "function", "function": {"name": "my_function"}}` (see the sketch after this list).
3. To disable function calling and force the model to only generate a user-facing message, you can set `tool_choice: "none"`.
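As a minimal sketch of these options, the snippet below forces a specific function (option 2); the tool definition and names are illustrative assumptions, and swapping in `"required"` or `"none"` exercises the other two options:
```python
from openai import OpenAI

client = OpenAI()

# A single illustrative tool definition; the function name and schema are assumptions.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }
]

# Force the model to call get_current_weather rather than letting it decide
# (option 2 above). Use tool_choice="required" or tool_choice="none" for the
# other behaviors.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
    tool_choice={"type": "function", "function": {"name": "get_current_weather"}},
)
print(response.choices[0].message.tool_calls)
```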
## Parallel function calling
Parallel function calling is the model's ability to perform multiple function calls together, allowing the effects and results of these function calls to be resolved in parallel. This is especially useful if functions take a long time, and reduces round trips with the API. For example, the model may call functions to get the weather in 3 different locations at the same time, which will result in a message with 3 function calls in the `tool_calls` array, each with an `id`. To respond to these function calls, add 3 new messages to the conversation, each containing the result of one function call, with a `tool_call_id` referencing the `id` from `tool_calls`.
Parallel function calling can be disabled by passing `parallel_tool_calls: false` in the request. The model will only call one function at a time when parallel function calling is disabled.
In this example, we define a single function `get_current_weather`. The model calls the function multiple times, and after sending the function response back to the model, we let it decide the next step. It responded with a user-facing message telling the user the temperature in San Francisco, Tokyo, and Paris. Depending on the query, it may choose to call a function again.
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
import json\n
client = OpenAI()\n
# Example dummy function hard coded to return the same weather
# In production, this could be your backend API or an external API
def get_current_weather(location, unit="fahrenheit"):
"""Get the current weather in a given location"""
if "tokyo" in location.lower():
return json.dumps({"location": "Tokyo", "temperature": "10", "unit": unit})
elif "san francisco" in location.lower():
return json.dumps({"location": "San Francisco", "temperature": "72", "unit": unit})
elif "paris" in location.lower():
return json.dumps({"location": "Paris", "temperature": "22", "unit": unit})
else:
return json.dumps({"location": location, "temperature": "unknown"})\n
def run_conversation():
# Step 1: send the conversation and available functions to the model
messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}]
tools = [
{
"type": "function",
"function": {
"name": "get_current_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g. San Francisco, CA",
},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
},
"required": ["location"],
},
},
}
]
response = client.chat.completions.create(
model="gpt-4o",
messages=messages,
tools=tools,
tool_choice="auto", # auto is default, but we'll be explicit
)
response_message = response.choices[0].message
tool_calls = response_message.tool_calls
# Step 2: check if the model wanted to call a function
if tool_calls:
# Step 3: call the function
# Note: the JSON response may not always be valid; be sure to handle errors
available_functions = {
"get_current_weather": get_current_weather,
} # only one function in this example, but you can have multiple
messages.append(response_message) # extend conversation with assistant's reply
# Step 4: send the info for each function call and function response to the model
for tool_call in tool_calls:
function_name = tool_call.function.name
function_to_call = available_functions[function_name]
function_args = json.loads(tool_call.function.arguments)
function_response = function_to_call(
location=function_args.get("location"),
unit=function_args.get("unit"),
)
messages.append(
{
"tool_call_id": tool_call.id,
"role": "tool",
"name": function_name,
"content": function_response,
}
) # extend conversation with function response
second_response = client.chat.completions.create(
model="gpt-4o",
messages=messages,
) # get a new response from the model where it can see the function response
return second_response
print(run_conversation())
`.trim(),
"node.js": `
import OpenAI from "openai";
const openai = new OpenAI();
\n
// Example dummy function hard coded to return the same weather
// In production, this could be your backend API or an external API
function getCurrentWeather(location, unit = "fahrenheit") {
if (location.toLowerCase().includes("tokyo")) {
return JSON.stringify({ location: "Tokyo", temperature: "10", unit: "celsius" });
} else if (location.toLowerCase().includes("san francisco")) {
return JSON.stringify({ location: "San Francisco", temperature: "72", unit: "fahrenheit" });
} else if (location.toLowerCase().includes("paris")) {
return JSON.stringify({ location: "Paris", temperature: "22", unit: "fahrenheit" });
} else {
return JSON.stringify({ location, temperature: "unknown" });
}
}
\n
async function runConversation() {
// Step 1: send the conversation and available functions to the model
const messages = [
{ role: "user", content: "What's the weather like in San Francisco, Tokyo, and Paris?" },
];
const tools = [
{
type: "function",
function: {
name: "get_current_weather",
description: "Get the current weather in a given location",
parameters: {
type: "object",
properties: {
location: {
type: "string",
description: "The city and state, e.g. San Francisco, CA",
},
unit: { type: "string", enum: ["celsius", "fahrenheit"] },
},
required: ["location"],
},
},
},
];
\n
const response = await openai.chat.completions.create({
model: "gpt-4o",
messages: messages,
tools: tools,
tool_choice: "auto", // auto is default, but we'll be explicit
});
const responseMessage = response.choices[0].message;\n
// Step 2: check if the model wanted to call a function
const toolCalls = responseMessage.tool_calls;
if (responseMessage.tool_calls) {
// Step 3: call the function
// Note: the JSON response may not always be valid; be sure to handle errors
const availableFunctions = {
get_current_weather: getCurrentWeather,
}; // only one function in this example, but you can have multiple
messages.push(responseMessage); // extend conversation with assistant's reply
for (const toolCall of toolCalls) {
const functionName = toolCall.function.name;
const functionToCall = availableFunctions[functionName];
const functionArgs = JSON.parse(toolCall.function.arguments);
const functionResponse = functionToCall(
functionArgs.location,
functionArgs.unit
);
messages.push({
tool_call_id: toolCall.id,
role: "tool",
name: functionName,
content: functionResponse,
}); // extend conversation with function response
}
const secondResponse = await openai.chat.completions.create({
model: "gpt-4o",
messages: messages,
}); // get a new response from the model where it can see the function response
return secondResponse.choices;
}
}
\n
runConversation().then(console.log).catch(console.error);
`.trim(),
}}
/>
You can find more examples of function calling in the OpenAI Cookbook:
<a
href="https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models"
target="_blank"
>
<IconItem icon={} color="green" title="Function calling" className="mt-6">
Learn from more examples demonstrating function calling
### Tokens
Under the hood, functions are injected into the system message in a syntax the model has been trained on. This means functions count against the model's context limit and are billed as input tokens. If you run into context limits, we suggest limiting the number of functions or the length of the documentation you provide for function parameters.
It is also possible to use [fine-tuning](/docs/guides/fine-tuning/fine-tuning-examples) to reduce the number of tokens used if you have many functions defined.
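If you want a rough sense of how many tokens your function definitions add to each request, one approach is to tokenize their JSON serialization. This is only an approximation, since the exact injected format is internal, and it assumes a recent `tiktoken` release that knows the gpt-4o encoding; treat the result as a relative estimate when comparing alternatives.

```python
import json
import tiktoken

# A small example tool definition; substitute your own "tools" list.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {"location": {"type": "string"}},
                "required": ["location"],
            },
        },
    }
]

# Approximation only: the model sees an internal rendering of these
# definitions, not this exact JSON, so use the count as a relative estimate.
encoding = tiktoken.encoding_for_model("gpt-4o")
print(len(encoding.encode(json.dumps(tools))), "tokens (approximate)")
```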

@ -0,0 +1,334 @@
# Getting started
Creating an action for a GPT takes 3 steps:
1. Build an API
2. Document the API in the OpenAPI YAML or JSON format
3. Expose the Schema to your GPT in the ChatGPT UI
The focus of the rest of this section will be creating a TODO list GPT by defining the custom action for the GPT.
If you want to kickstart the process of creating your GPT schema, you can use the [experimental ActionsGPT](https://chatgpt.com/g/g-TYEliDU6A-actionsgpt). Keep in mind the [known limitation when building a GPT](/docs/actions/getting-started/limitations).
## Schema definition
Once you [create the basics](https://help.openai.com/en/articles/8554397-creating-a-gpt) of a TODO GPT, the next step is to build the [OpenAPI specification](https://swagger.io/specification/) to document the API. The model in ChatGPT is only aware of your API structure as defined in the schema. If your API is extensive, you don't need to expose all its functionality to the model; you can choose only specific endpoints to include. For example, if you have a social media API, you might want to have the model access content from the site through a GET request but prevent the model from being able to comment on users' posts in order to reduce the chance of spam.
The OpenAPI specification is the wrapper that sits on top of your API. A basic OpenAPI specification will look like the following:
```
openapi: 3.0.1
info:
title: TODO Action
description: An action that allows the user to create and manage a TODO list using a GPT.
version: 'v1'
servers:
- url: https://example.com
paths:
/todos:
get:
operationId: getTodos
summary: Get the list of todos
responses:
"200":
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/getTodosResponse'
components:
schemas:
getTodosResponse:
type: object
properties:
todos:
type: array
items:
type: string
description: The list of todos.
```
We start by defining the specification version, the title, description, and version number. When a query is run in ChatGPT, it will look at the description that is defined in the info section to determine if the action is relevant for the user query. You can read more about prompting in the [writing descriptions](/docs/actions/getting-started/writing-descriptions) section.
Keep in mind the following limits in your OpenAPI specification, which are subject to change:
- 300 characters max for each API endpoint description/summary field in API specification
- 700 characters max for each API parameter description field in API specification
The OpenAPI specification follows the traditional OpenAPI format; you can [learn more about OpenAPI formatting](https://swagger.io/tools/open-source/getting-started/) and how it works. There are also many tools that auto-generate OpenAPI specifications based on your underlying API code.
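Because auto-generated specifications can easily exceed the limits above, it may help to lint your schema before pasting it into the GPT editor. The sketch below is illustrative only; it assumes your spec is saved as `openapi.yaml` and that PyYAML is installed.

```python
import yaml

MAX_ENDPOINT_DESC = 300  # per endpoint description/summary
MAX_PARAM_DESC = 700     # per parameter description

with open("openapi.yaml") as f:
    spec = yaml.safe_load(f)

for path, methods in spec.get("paths", {}).items():
    for method, op in methods.items():
        if not isinstance(op, dict):
            continue  # skip path-level keys like "parameters" or "summary"
        for field in ("summary", "description"):
            text = op.get(field, "")
            if len(text) > MAX_ENDPOINT_DESC:
                print(f"{method.upper()} {path}: {field} is {len(text)} chars")
        for param in op.get("parameters", []):
            desc = param.get("description", "")
            if len(desc) > MAX_PARAM_DESC:
                print(f"{method.upper()} {path}: parameter "
                      f"{param.get('name')} description is {len(desc)} chars")
```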
### Hosted OpenAPI specification
With Actions, we host the OpenAPI specification for your API in order to track changes. You can import an existing OpenAPI specification or create a new one from scratch using the [UI in the GPT creator](https://chatgpt.com/gpts/editor).
### Sending files
POST requests can include up to ten files (including DALL-E generated images) from the conversation. They will be sent as URLs which are valid for five minutes.
For files to be part of your POST request, the parameter must be named `openaiFileIdRefs` and the description should explain to the model the type and quantity of the files which your API is expecting.
The `openaiFileIdRefs` parameter will be populated with an array of JSON objects. Each object contains:
- `name` The name of the file. This will be an auto generated name when created by DALL-E.
- `id` A stable identifier for the file.
- `mime_type` The mime type of the file. For user uploaded files this is based on file extension.
- `download_link` The URL to fetch the file which is valid for five minutes.
Here's an example of an `openaiFileIdRefs` array with two elements:
```
[
{
"name": "dalle-Lh2tg7WuosbyR9hk",
"id": "file-XFlOqJYTPBPwMZE3IopCBv1Z",
"mime_type": "image/webp",
"download_link": "https://files.oaiusercontent.com/file-XFlOqJYTPBPwMZE3IopCBv1Z?se=2024-03-11T20%3A29%3A52Z&sp=r&sv=2021-08-06&sr=b&rscc=max-age%3D31536000%2C%20immutable&rscd=attachment%3B%20filename%3Da580bae6-ea30-478e-a3e2-1f6c06c3e02f.webp&sig=ZPWol5eXACxU1O9azLwRNgKVidCe%2BwgMOc/TdrPGYII%3D"
},
{
"name": "2023 Benefits Booklet.pdf",
"id": "file-s5nX7o4junn2ig0J84r8Q0Ew",
"mime_type": "application/pdf",
"download_link": "https://files.oaiusercontent.com/file-s5nX7o4junn2ig0J84r8Q0Ew?se=2024-03-11T20%3A29%3A52Z&sp=r&sv=2021-08-06&sr=b&rscc=max-age%3D299%2C%20immutable&rscd=attachment%3B%20filename%3D2023%2520Benefits%2520Booklet.pdf&sig=Ivhviy%2BrgoyUjxZ%2BingpwtUwsA4%2BWaRfXy8ru9AfcII%3D"
}
]
```
Actions can include files uploaded by the user, images generated by DALL-E, and files created by Code Interpreter.
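On the API side, your endpoint receives this array in the JSON body and is responsible for fetching each file before its five-minute download link expires. Below is a minimal, hypothetical Flask handler for the `/createWidget` endpoint shown in the OpenAPI example that follows; the route name and storage logic are illustrative only.

```python
import requests
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.post("/createWidget")  # hypothetical route matching the example schema below
def create_widget():
    refs = (request.get_json() or {}).get("openaiFileIdRefs", [])
    saved = []
    for ref in refs:
        # download_link is only valid for about five minutes, so fetch it now
        resp = requests.get(ref["download_link"], timeout=10)
        resp.raise_for_status()
        with open(ref["name"], "wb") as f:
            f.write(resp.content)
        saved.append({"id": ref["id"], "name": ref["name"], "mime_type": ref["mime_type"]})
    return jsonify({"saved_files": saved})
```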
#### OpenAPI Example
```
/createWidget:
post:
operationId: createWidget
summary: Creates a widget based on an image.
description: Uploads a file reference using its file id. This file should be an image created by DALL·E or uploaded by the user. JPG, WEBP, and PNG are supported for widget creation.
requestBody:
required: true
content:
application/json:
schema:
type: object
properties:
openaiFileIdRefs:
type: array
items:
type: string
```
While this schema shows `openaiFileIdRefs` as being an array of type `string`, at runtime this will be populated with an array of JSON objects as previously shown.
### Returning files
Requests may return up to 10 files. Each file may be up to 10 MB and cannot be an image or video.
These files will become part of the conversation similarly to if a user uploaded them, meaning they may be made available to code interpreter, file search, and sent as part of subsequent action invocations. In the web app users will see that the files have been returned and can download them.
To return files, the body of the response must contain an `openaiFileResponse` parameter. This parameter must always be an array and must be populated in one of two ways.
#### Inline option
Each element of the array is a JSON object which contains:
- `name` The name of the file. This will be visible to the user.
- `mime_type` The MIME type of the file. This is used to determine eligibility and which features have access to the file.
- `content` The base64 encoded contents of the file.
Here's an example of an `openaiFileResponse` array with two elements:
```
[
{
"name": "example_document.pdf",
"mime_type": "application/pdf",
"content": "JVBERi0xLjQKJcfsj6IKNSAwIG9iago8PC9MZW5ndGggNiAwIFIvRmlsdGVyIC9GbGF0ZURlY29kZT4+CnN0cmVhbQpHhD93PQplbmRzdHJlYW0KZW5kb2JqCg=="
},
{
"name": "sample_spreadsheet.csv",
"mime_type": "text/csv",
"content": "iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=="
}
]
```
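A backend can build this inline response by base64-encoding each file's bytes. The sketch below shows one way to do it in Python; the file names and MIME types are illustrative.

```python
import base64
import json

def inline_file(path: str, mime_type: str) -> dict:
    """Return one element of an openaiFileResponse array (inline option)."""
    with open(path, "rb") as f:
        content = base64.b64encode(f.read()).decode("ascii")
    return {"name": path, "mime_type": mime_type, "content": content}

body = {
    "openaiFileResponse": [
        inline_file("example_document.pdf", "application/pdf"),
        inline_file("sample_spreadsheet.csv", "text/csv"),
    ]
}
print(json.dumps(body)[:200])  # this JSON is what your API returns to the GPT
```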
#### OpenAPI example
```
/papers:
get:
operationId: findPapers
summary: Retrieve PDFs of relevant academic papers.
description: Provided an academic topic, up to five relevant papers will be returned as PDFs.
parameters:
- in: query
name: topic
required: true
schema:
type: string
description: The topic the papers should be about.
responses:
'200':
description: Zero to five academic paper PDFs
content:
application/json:
schema:
type: object
properties:
openaiFileResponse:
type: array
items:
type: object
properties:
name:
type: string
description: The name of the file.
mime_type:
type: string
description: The MIME type of the file.
content:
type: string
format: byte
description: The content of the file in base64 encoding.
```
#### URL option
Each element of the array is a URL referencing a file to be downloaded. The headers `Content-Disposition` and `Content-Type` must be set such that a file name and MIME type can be determined. The name of the file will be visible to the user. The MIME type of the file determines eligibility and which features have access to the file.
There is a 10 second timeout for fetching each file.
Here's an example of an `openaiFileResponse` array with two elements:
```
[
"https://example.com/f/dca89f18-16d4-4a65-8ea2-ededced01646",
"https://example.com/f/01fad6b0-635b-4803-a583-0f678b2e6153"
]
```
Here's an example of the required headers for each URL:
```
Content-Type: application/pdf
Content-Disposition: attachment; filename="example_document.pdf"
```
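If you go with the URL option, the files themselves must be served with those two headers. As a sketch, a Flask endpoint could do this with `send_file`; the route, file path, and filename below are illustrative only.

```python
from flask import Flask, send_file

app = Flask(__name__)

@app.get("/f/<file_id>")  # hypothetical download route referenced by the URLs above
def download(file_id: str):
    # send_file sets Content-Type from mimetype and Content-Disposition from
    # download_name; remember the fetch must complete within 10 seconds.
    return send_file(
        f"./papers/{file_id}.pdf",
        mimetype="application/pdf",
        as_attachment=True,
        download_name="example_document.pdf",
    )
```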
#### OpenAPI example
```
/papers:
get:
operationId: findPapers
summary: Retrieve PDFs of relevant academic papers.
description: Provided an academic topic, up to five relevant papers will be returned as PDFs.
parameters:
- in: query
name: topic
required: true
schema:
type: string
description: The topic the papers should be about.
responses:
'200':
description: Zero to five academic paper PDFs
content:
application/json:
schema:
type: object
properties:
openaiFileResponse:
type: array
items:
type: string
format: uri
description: URLs to fetch the files.
```
### Consequential flag
In the OpenAPI specification, you can now set certain endpoints as "consequential" as shown below:
```
paths:
/todo:
get:
operationId: getTODOs
description: Fetches items in a TODO list from the API.
security: []
post:
operationId: updateTODOs
description: Mutates the TODO list.
x-openai-isConsequential: true
```
A good example of a consequential action is booking a hotel room and paying for it on behalf of a user.
- If the `x-openai-isConsequential` field is `true`, we treat the operation as "must always prompt the user for confirmation before running" and don't show an "always allow" button (both are features of GPTs designed to give builders and users more control over actions).
- If the `x-openai-isConsequential` field is `false`, we show the "always allow" button.
- If the field isn't present, we default all GET operations to `false` and all other operations to `true`.
### Multiple authentication schemas
When defining an action, you can use a single authentication type (OAuth or API key) alongside endpoints that do not require authentication.
You can learn more about action authentication on our [actions authentication page](/docs/actions/authentication).
## Testing an action
In the GPT editor, once you have added an action, a new section called "Available actions" appears below the schema; it is generated by parsing the schema. You can preview the name, method, and path each action lives at. A "Test" button is also displayed, which allows you to try your actions. After you press "Test", the preview section of the GPT editor presents a request to "Allow", "Always allow", or "Decline" to run the action. These are user confirmations designed to give end users more control over what an action does.
Various debugging information is also made available inside the preview mode, which should help you understand any unintended behavior. If everything is working as expected, you can save or update your GPT in the top right corner.
## Writing descriptions
When a user makes a query that might trigger an action, the model looks through the descriptions of the endpoints in the schema. Just like with prompting other language models, you will want to test out multiple prompts and descriptions to see what works best.
The schema is a great place to provide the model with detailed information about your API, such as the available functions and their parameters. Besides using expressive, informative names for each field, the schema can also contain "description" fields for every attribute. You can use these fields to provide natural language descriptions that explain what each method does or what information a query field requires. The model will be able to see these, and they will guide it in using the API. If a field is restricted to only certain values, you can also provide an "enum" with descriptive category names.
The instructions for a GPT give you the freedom to instruct the GPT on how to use your action generally. Overall, the language model behind ChatGPT is highly capable of understanding natural language and following instructions. Therefore, this is a good place to put in general instructions on what your action does and how the GPT should use it properly. Use natural language, preferably in a concise yet descriptive and objective tone. You can look at some of the examples to have an idea of what this should look like.
### Best practices
Here are some best practices to follow when writing your GPT instructions and descriptions in your schema, as well as when designing your API responses:
1. Your descriptions should not encourage the GPT to use the action when the user hasn't asked for your action's particular category of service.
_Bad example_:
> Whenever the user mentions any type of task, ask if they would like to use the TODO action to add something to their todo list.
_Good example_:
> The TODO list can add, remove and view the user's TODOs.
2. Your descriptions should not prescribe specific triggers for the GPT to use the action. ChatGPT is designed to use your action automatically when appropriate.
_Bad example_:
> When the user mentions a task, respond with "Would you like me to add this to your TODO list? Say 'yes' to continue."
_Good example_:
> [no instructions needed for this]
3. Action responses from an API should return raw data instead of natural language responses unless it's necessary. The GPT will provide its own natural language response using the returned data.
_Bad example_:
> I was able to find your todo list! You have 2 todos: get groceries and walk the dog. I can add more todos if you'd like!
_Good example_:
> { "todos": [ "get groceries", "walk the dog" ] }
## Limitations
There are a few limitations to be aware of when building with actions:
- Custom headers are not supported
- With the exception of Google, Microsoft and Adobe OAuth domains, all domains used in an OAuth flow must be the same as the domain used for the primary endpoints
- Request and response payloads must be less than 100,000 characters each
- Requests timeout after 45 seconds
- Requests and responses can only contain text (no images or video)
If you have questions or run into additional limitations, you can join the discussion on the [OpenAI developer forum](https://community.openai.com).

@ -0,0 +1,39 @@
# GPTBot
GPTBot is OpenAI's web crawler and can be identified by the following [user agent](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent) and string.
```
User agent token: GPTBot
Full user-agent string: Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)
```
## Usage
Web pages crawled with the GPTBot user agent may potentially be used to improve future models and are filtered to remove sources that require paywall access, are known to primarily aggregate personally identifiable information (PII), or have text that violates our policies. Allowing GPTBot to access your site can help AI models become more accurate and improve their general capabilities and safety. Below, we also share how to disallow GPTBot from accessing your site.
### Disallowing GPTBot
To disallow GPTBot from accessing your site, add GPTBot to your site's robots.txt:
```
User-agent: GPTBot
Disallow: /
```
### Customize GPTBot access
To allow GPTBot to access only parts of your site, add the GPTBot token to your site's robots.txt like this:
```
User-agent: GPTBot
Allow: /directory-1/
Disallow: /directory-2/
```
### GPTBot and ChatGPT-User
OpenAI has two separate user agents for web crawling and user browsing, so you know which use-case a given request is for. Our opt-out system currently treats both user agents the same, so any robots.txt disallow for one agent will cover both. [Read more about ChatGPT-User here](https://platform.openai.com/docs/plugins/bot).
### IP egress ranges
For OpenAI's crawler, calls to websites will be made from the IP address block documented on the [OpenAI website](https://openai.com/gptbot.json).
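If you want to verify that a request claiming to be GPTBot actually originates from OpenAI, one approach is to check the source IP against the published ranges. The sketch below assumes the JSON document lists CIDR prefixes under a `prefixes` key with `ipv4Prefix`/`ipv6Prefix` entries; inspect the live file before relying on those field names.

```python
import ipaddress
import requests

def is_gptbot_ip(ip: str) -> bool:
    # Assumption: the published file contains a "prefixes" list with
    # "ipv4Prefix"/"ipv6Prefix" entries; verify against the live document.
    data = requests.get("https://openai.com/gptbot.json", timeout=10).json()
    addr = ipaddress.ip_address(ip)
    for prefix in data.get("prefixes", []):
        cidr = prefix.get("ipv4Prefix") or prefix.get("ipv6Prefix")
        if not cidr:
            continue
        net = ipaddress.ip_network(cidr)
        if addr.version == net.version and addr in net:
            return True
    return False

print(is_gptbot_ip("203.0.113.7"))  # example address; expected False
```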

@ -0,0 +1,62 @@
# Hackathon support request
Thank you for your interest in having OpenAI support your developer hackathon. This page includes the relevant information about what types of events we support, how to request support, and more.
## Event criteria
We consider many event requests but tend to give priority to those that meet the following criteria:
- The event is developer-focused
- There is a public website for the event with information about the hackathon and a code of conduct displayed
- The event begins more than 21 days from the time of requesting support
- Post-event, we'd kindly ask for feedback via survey
Due to high demand, we will not be able to support all requests. Please allow our team 7 days to review the request.
## Request support
To submit a request for support of your hackathon, you must be signed in to your OpenAI account; you can then use our [hackathon submission bot](#).
To view the status of your hackathon submission, make sure you are logged in and select "Help" in the top right corner of this page. Under "Messages", you will be able to see your hackathon submission. We will notify you as the status of your request changes during the review process.
## Process
After you submit a request, you can expect the following steps to take place:
1. Your request for hackathon support will be reviewed by the team (generally within 7 days).
2. We will respond to the ticket and specify if and how we are able to support the event.
- If you request credits, a speaker, and prize credits, we will respond with which of these we are able to accommodate.
3. If your request is approved, we will ask for the email and OrgID of the developers attending the event. This is required for us to grant credits.
4. Once we get this info, we can generally process the credits in 1-2 days.
## Branding guidelines
Once approved, we ask for the ability to review marketing materials that mention OpenAI or our support. We typically do not allow our logo to be used for marketing materials at hackathons.
We do allow hackathons to say that "credits are provided by OpenAI" or "supported by OpenAI" if they are selected to receive support.
## FAQ
#### Is there a limit to the number of people I can have at my event?
No. But for events with more than 1,000 people, we require a discussion with your event organizers so we can determine how best to accommodate them.
#### Can I say that OpenAI is sponsoring my hackathon?
We reserve the “sponsoring” language for events that we have a sponsorship agreement with. We are okay with you describing our support as follows: "Credits are being provided by OpenAI" or “Supported by OpenAI”.
#### Does OpenAI do cash sponsorships of hackathons?
Not at this time.
#### Can someone from OpenAI speak at my hackathon?
Maybe! We love getting out and connecting with the developer community but have limited bandwidth to attend events.
#### Can my hackathon be virtual?
Yes, we support in-person, virtual, and hybrid hackathon events!

@ -0,0 +1,577 @@
# How Assistants work (Beta)
The Assistants API is designed to help developers build powerful AI assistants capable of performing a variety of tasks.
<Notice
className="mt-2 mb-2"
icon={false}
color={NoticeColor.primary}
body={
The Assistants API is in beta and we are actively working on
adding more functionality. Share your feedback in our{" "}
Developer Forum!
}
textSize={NoticeTextSize.large}
dismissable={false}
/>
1. Assistants can call OpenAI's **[models](/docs/models)** with specific instructions to tune their personality and capabilities.
2. Assistants can access **multiple tools in parallel**. These can be both OpenAI-hosted tools — like [code_interpreter](/docs/assistants/tools/code-interpreter) and [file_search](/docs/assistants/tools/file-search) — or tools you build / host (via [function calling](/docs/assistants/tools/function-calling)).
3. Assistants can access **persistent Threads**. Threads simplify AI application development by storing message history and truncating it when the conversation gets too long for the model's context length. You create a Thread once, and simply append Messages to it as your users reply.
4. Assistants can access files in several formats — either as part of their creation or as part of Threads between Assistants and users. When using tools, Assistants can also create files (e.g., images, spreadsheets, etc) and cite files they reference in the Messages they create.
## Objects
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-assistant.webp)
| Object | What it represents |
| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Assistant | Purpose-built AI that uses OpenAI's [models](/docs/models) and calls [tools](/docs/assistants/tools) |
| Thread | A conversation session between an Assistant and a user. Threads store Messages and automatically handle truncation to fit content into a model's context. |
| Message | A message created by an Assistant or a user. Messages can include text, images, and other files. Messages are stored as a list on the Thread. |
| Run | An invocation of an Assistant on a Thread. The Assistant uses its configuration and the Thread's Messages to perform tasks by calling models and tools. As part of a Run, the Assistant appends Messages to the Thread. |
| Run Step | A detailed list of steps the Assistant took as part of a Run. An Assistant can call tools or create Messages during its run. Examining Run Steps allows you to introspect how the Assistant is getting to its final results. |
## Creating Assistants
We recommend using OpenAIs{" "}
latest models with the Assistants API
for best results and maximum compatibility with tools.
To get started, creating an Assistant only requires specifying the `model` to use. But you can further customize the behavior of the Assistant:
1. Use the `instructions` parameter to guide the personality of the Assistant and define its goals. Instructions are similar to system messages in the Chat Completions API.
2. Use the `tools` parameter to give the Assistant access to up to 128 tools. You can give it access to OpenAI-hosted tools like `code_interpreter` and `file_search`, or call third-party tools via function calling.
3. Use the `tool_resources` parameter to give tools like `code_interpreter` and `file_search` access to files. Files are uploaded using the `File` [upload endpoint](/docs/api-reference/files/create) and must have the `purpose` set to `assistants` to be used with this API.
For example, to create an Assistant that can create data visualization based on a `.csv` file, first upload a file.
<CodeSample
defaultLanguage="python"
code={{
python: `
file = client.files.create(
file=open("revenue-forecast.csv", "rb"),
purpose='assistants'
)
`.trim(),
"node.js": `
const file = await openai.files.create({
file: fs.createReadStream("revenue-forecast.csv"),
purpose: "assistants",
});
`.trim(),
curl: `
curl https://api.openai.com/v1/files \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-F purpose="assistants" \\
-F file="@revenue-forecast.csv"
`.trim(),
}}
/>
Then, create the Assistant with the `code_interpreter` tool enabled and provide the file as a resource to the tool.
<CodeSample
defaultLanguage="python"
code={{
python: `
assistant = client.beta.assistants.create(
name="Data visualizer",
description="You are great at creating beautiful data visualizations. You analyze data present in .csv files, understand trends, and come up with data visualizations relevant to those trends. You also share a brief text summary of the trends observed.",
model="gpt-4o",
tools=[{"type": "code_interpreter"}],
tool_resources={
"code_interpreter": {
"file_ids": [file.id]
}
}
)
`.trim(),
"node.js": `
const assistant = await openai.beta.assistants.create({
name: "Data visualizer",
description: "You are great at creating beautiful data visualizations. You analyze data present in .csv files, understand trends, and come up with data visualizations relevant to those trends. You also share a brief text summary of the trends observed.",
model: "gpt-4o",
tools: [{"type": "code_interpreter"}],
tool_resources: {
"code_interpreter": {
"file_ids": [file.id]
}
}
});
`.trim(),
curl: `
curl https://api.openai.com/v1/assistants \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-H "OpenAI-Beta: assistants=v2" \\
-d '{
"name": "Data visualizer",
"description": "You are great at creating beautiful data visualizations. You analyze data present in .csv files, understand trends, and come up with data visualizations relevant to those trends. You also share a brief text summary of the trends observed.",
"model": "gpt-4o",
"tools": [{"type": "code_interpreter"}],
"tool_resources": {
"code_interpreter": {
"file_ids": ["file-BK7bzQj3FfZFXr7DbL6xJwfo"]
}
}
}'
`.trim(),
}}
/>
You can attach a maximum of 20 files to `code_interpreter` and 10,000 files to `file_search` (using `vector_store` [objects](/docs/api-reference/vector-stores/object)).
Each file can be at most 512 MB in size and have a maximum of 5,000,000 tokens. By default, the size of all the files uploaded by your organization cannot exceed 100 GB, but you can reach out to our support team to increase this limit.
## Managing Threads and Messages
Threads and Messages represent a conversation session between an Assistant and a user. There is no limit to the number of Messages you can store in a Thread. Once the size of the Messages exceeds the context window of the model, the Thread will attempt to smartly truncate messages before fully dropping the ones it considers the least important.
You can create a Thread with an initial list of Messages like this:
<CodeSample
defaultLanguage="python"
code={{
python: `
thread = client.beta.threads.create(
messages=[
{
"role": "user",
"content": "Create 3 data visualizations based on the trends in this file.",
"attachments": [
{
"file_id": file.id,
"tools": [{"type": "code_interpreter"}]
}
]
}
]
)
`.trim(),
"node.js": `
const thread = await openai.beta.threads.create({
messages: [
{
"role": "user",
"content": "Create 3 data visualizations based on the trends in this file.",
"attachments": [
{
file_id: file.id,
tools: [{type: "code_interpreter"}]
}
]
}
]
});
`.trim(),
curl: `
curl https://api.openai.com/v1/threads \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-H "OpenAI-Beta: assistants=v2" \\
-d '{
"messages": [
{
"role": "user",
"content": "Create 3 data visualizations based on the trends in this file.",
"attachments": [
{
"file_id": "file-ACq8OjcLQm2eIG0BvRM4z5qX",
"tools": [{"type": "code_interpreter"}]
}
]
}
]
}'
`.trim(),
}}
/>
Messages can contain text, images, or file attachments. Message `attachments` are helper methods that add files to a thread's `tool_resources`. You can also choose to add files to the `thread.tool_resources` directly.
### Creating image input content
Message content can contain either external image URLs or File IDs uploaded via the [File API](/docs/api-reference/files/create). Only [models](/docs/models) with Vision support can accept image input. Supported image content types include png, jpg, gif, and webp. When creating image files, pass `purpose="vision"` to allow you to later download and display the input content. Currently, there is a 100 GB limit per organization and a 10 GB limit per user in an organization. Please contact us to request a limit increase.
Tools cannot access image content unless specified. To pass image files to Code Interpreter, add the file ID in the message `attachments` list to allow the tool to read and analyze the input. Image URLs cannot be downloaded in Code Interpreter today.
<CodeSample
defaultLanguage="python"
code={{
python: `
file = client.files.create(
file=open("myimage.png", "rb"),
purpose="vision"
)
thread = client.beta.threads.create(
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is the difference between these images?"
},
{
"type": "image_url",
"image_url": {"url": "https://example.com/image.png"}
},
{
"type": "image_file",
"image_file": {"file_id": file.id}
},
],
}
]
)
`.trim(),
"node.js": `
const file = await openai.files.create({
file: fs.createReadStream("myimage.png"),
purpose: "vision",
});
const thread = await openai.beta.threads.create({
messages: [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is the difference between these images?"
},
{
"type": "image_url",
"image_url": {"url": "https://example.com/image.png"}
},
{
"type": "image_file",
"image_file": {"file_id": file.id}
},
]
}
]
});
`.trim(),
curl: `
# Upload a file with a "vision" purpose
curl https://api.openai.com/v1/files \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-F purpose="vision" \\
-F file="@/path/to/myimage.png"\n
# Pass the file ID in the content
curl https://api.openai.com/v1/threads \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-H "OpenAI-Beta: assistants=v2" \\
-d '{
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is the difference between these images?"
},
{
"type": "image_url",
"image_url": {"url": "https://example.com/image.png"}
},
{
"type": "image_file",
"image_file": {"file_id": file.id}
}
]
}
]
}'
`.trim(),
}}
/>
#### Low or high fidelity image understanding
The `detail` parameter, which has three options (`low`, `high`, and `auto`), gives you control over how the model processes the image and generates its textual understanding.
- `low` will enable the "low res" mode. The model will receive a low-res 512px x 512px version of the image, and represent the image with a budget of 85 tokens. This allows the API to return faster responses and consume fewer input tokens for use cases that do not require high detail.
- `high` will enable "high res" mode, which first allows the model to see the low res image and then creates detailed crops of input images based on the input image size. Use the [pricing calculator](https://openai.com/api/pricing/) to see token counts for various image sizes.
<CodeSample
defaultLanguage="python"
code={{
python: `
thread = client.beta.threads.create(
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is this an image of?"
},
{
"type": "image_url",
"image_url": {
"url": "https://example.com/image.png",
"detail": "high"
}
},
],
}
]
)
`.trim(),
"node.js": `
const thread = await openai.beta.threads.create({
messages: [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is this an image of?"
},
{
"type": "image_url",
"image_url": {
"url": "https://example.com/image.png",
"detail": "high"
}
},
]
}
]
});
`.trim(),
curl: `
curl https://api.openai.com/v1/threads \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-H "OpenAI-Beta: assistants=v2" \\
-d '{
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is this an image of?"
},
{
"type": "image_url",
"image_url": {
"url": "https://example.com/image.png",
"detail": "high"
}
},
]
}
]
}'
`.trim(),
}}
/>
### Context window management
The Assistants API automatically manages the truncation to ensure it stays within the model's maximum context length. You can customize this behavior by specifying the maximum tokens you'd like a run to utilize and/or the maximum number of recent messages you'd like to include in a run.
#### Max Completion and Max Prompt Tokens
To control the token usage in a single Run, set `max_prompt_tokens` and `max_completion_tokens` when creating the Run. These limits apply to the total number of tokens used in all completions throughout the Run's lifecycle.
For example, initiating a Run with `max_prompt_tokens` set to 500 and `max_completion_tokens` set to 1000 means the first completion will truncate the thread to 500 tokens and cap the output at 1000 tokens. If only 200 prompt tokens and 300 completion tokens are used in the first completion, the second completion will have available limits of 300 prompt tokens and 700 completion tokens.
If a completion reaches the `max_completion_tokens` limit, the Run will terminate with a status of `incomplete`, and details will be provided in the `incomplete_details` field of the Run object.
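As a sketch (assuming `client`, `thread`, and `assistant` from the earlier snippets), setting both limits when creating a Run looks like this:

```python
# thread and assistant are assumed to have been created as shown earlier
run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=assistant.id,
    max_prompt_tokens=500,       # budget for tokens the thread contributes to each completion
    max_completion_tokens=1000,  # budget for tokens generated across the whole Run
)
```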
When using the File Search tool, we recommend setting the max_prompt_tokens to no less than 20,000. For longer conversations or multiple interactions with File Search, consider increasing this limit to 50,000, or ideally, removing the max_prompt_tokens limits altogether to get the highest quality results.
#### Truncation Strategy
You may also specify a truncation strategy to control how your thread should be rendered into the model's context window.
Using a truncation strategy of type `auto` will use OpenAI's default truncation strategy. Using a truncation strategy of type `last_messages` will allow you to specify the number of the most recent messages to include in the context window.
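For example, here is a sketch that limits the context to the 10 most recent messages (again assuming `client`, `thread`, and `assistant` from the earlier snippets):

```python
run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id=assistant.id,
    # Only the 10 most recent messages are rendered into the model's context
    truncation_strategy={"type": "last_messages", "last_messages": 10},
)
```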
### Message annotations
Messages created by Assistants may contain [`annotations`](/docs/api-reference/messages/object#messages/object-content) within the `content` array of the object. Annotations provide information about how you should annotate the text in the Message.
There are two types of Annotations:
1. `file_citation`: File citations are created by the [`file_search`](/docs/assistants/tools/file-search) tool and define references to a specific file that was uploaded and used by the Assistant to generate the response.
2. `file_path`: File path annotations are created by the [`code_interpreter`](/docs/assistants/tools/code-interpreter) tool and contain references to the files generated by the tool.
When annotations are present in the Message object, you'll see illegible model-generated substrings in the text that you should replace with the annotations. These strings may look something like `【13†source】` or `sandbox:/mnt/data/file.csv`. Here's an example Python code snippet that replaces these strings with information present in the annotations.
<CodeSample
defaultLanguage="python"
code={{
python: `
# Retrieve the message object
message = client.beta.threads.messages.retrieve(
thread_id="...",
message_id="..."
)
# Extract the message content
message_content = message.content[0].text
annotations = message_content.annotations
citations = []
# Iterate over the annotations and add footnotes
for index, annotation in enumerate(annotations):
# Replace the text with a footnote
message_content.value = message_content.value.replace(annotation.text, f' [{index}]')
# Gather citations based on annotation attributes
if (file_citation := getattr(annotation, 'file_citation', None)):
cited_file = client.files.retrieve(file_citation.file_id)
citations.append(f'[{index}] {file_citation.quote} from {cited_file.filename}')
elif (file_path := getattr(annotation, 'file_path', None)):
cited_file = client.files.retrieve(file_path.file_id)
citations.append(f'[{index}] Click to download {cited_file.filename}')
# Note: File download functionality not implemented above for brevity
# Add footnotes to the end of the message before displaying to user
message_content.value += '\\n' + '\\n'.join(citations)
`.trim(),
}}
/>
## Runs and Run Steps
When you have all the context you need from your user in the Thread, you can run the Thread with an Assistant of your choice.
<CodeSample
defaultLanguage="python"
code={{
python: `
run = client.beta.threads.runs.create(
thread_id=thread.id,
assistant_id=assistant.id
)
`.trim(),
"node.js": `
const run = await openai.beta.threads.runs.create(
thread.id,
{ assistant_id: assistant.id }
);
`.trim(),
curl: `
curl https://api.openai.com/v1/threads/THREAD_ID/runs \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-H "OpenAI-Beta: assistants=v2" \\
-d '{
"assistant_id": "asst_ToSF7Gb04YMj8AMMm50ZLLtY"
}'
`.trim(),
}}
/>
By default, a Run will use the `model` and `tools` configuration specified in the Assistant object, but you can override most of these when creating the Run for added flexibility:
<CodeSample
defaultLanguage="python"
code={{
python: `
run = client.beta.threads.runs.create(
thread_id=thread.id,
assistant_id=assistant.id,
model="gpt-4o",
instructions="New instructions that override the Assistant instructions",
tools=[{"type": "code_interpreter"}, {"type": "file_search"}]
)
`.trim(),
"node.js": `
const run = await openai.beta.threads.runs.create(
thread.id,
{
assistant_id: assistant.id,
model: "gpt-4o",
instructions: "New instructions that override the Assistant instructions",
tools: [{"type": "code_interpreter"}, {"type": "file_search"}]
}
);
`.trim(),
curl: `
curl https://api.openai.com/v1/threads/THREAD_ID/runs \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-H "OpenAI-Beta: assistants=v2" \\
-d '{
"assistant_id": "ASSISTANT_ID",
"model": "gpt-4o",
"instructions": "New instructions that override the Assistant instructions",
"tools": [{"type": "code_interpreter"}, {"type": "file_search"}]
}'
`.trim(),
}}
/>
Note: `tool_resources` associated with the Assistant cannot be overridden during Run creation. You must use the [modify Assistant](/docs/api-reference/assistants/modifyAssistant) endpoint to do this.
#### Run lifecycle
Run objects can have multiple statuses.
![Run lifecycle - diagram showing possible status transitions](https://cdn.openai.com/API/docs/images/diagram-run-statuses-v2.png)
| Status | Definition |
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `queued` | When Runs are first created or when you complete the `required_action`, they are moved to a queued status. They should almost immediately move to `in_progress`. |
| `in_progress` | While in_progress, the Assistant uses the model and tools to perform steps. You can view progress being made by the Run by examining the [Run Steps](/docs/api-reference/runs/step-object). |
| `completed` | The Run successfully completed! You can now view all Messages the Assistant added to the Thread, and all the steps the Run took. You can also continue the conversation by adding more user Messages to the Thread and creating another Run. |
| `requires_action` | When using the [Function calling](/docs/assistants/tools/function-calling) tool, the Run will move to a `requires_action` state once the model determines the names and arguments of the functions to be called. You must then run those functions and [submit the outputs](/docs/api-reference/runs/submitToolOutputs) before the run proceeds. If the outputs are not provided before the `expires_at` timestamp passes (roughly 10 mins past creation), the run will move to an expired status. |
| `expired` | This happens when the function calling outputs were not submitted before `expires_at` and the run expires. Additionally, if the runs take too long to execute and go beyond the time stated in `expires_at`, our systems will expire the run. |
| `cancelling` | You can attempt to cancel an `in_progress` run using the [Cancel Run](/docs/api-reference/runs/cancelRun) endpoint. Once the attempt to cancel succeeds, status of the Run moves to `cancelled`. Cancellation is attempted but not guaranteed. |
| `cancelled` | Run was successfully cancelled. |
| `failed` | You can view the reason for the failure by looking at the `last_error` object in the Run. The timestamp for the failure will be recorded under `failed_at`. |
| `incomplete` | Run ended due to `max_prompt_tokens` or `max_completion_tokens` reached. You can view the specific reason by looking at the `incomplete_details` object in the Run. |
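When a Run reaches `requires_action`, you execute the requested functions yourself and submit their outputs. A minimal sketch follows; `my_function_registry` is a hypothetical mapping from function names to your own implementations, and `client`, `thread`, and `run` are assumed from the earlier snippets.

```python
import json

run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

if run.status == "requires_action":
    tool_outputs = []
    for tool_call in run.required_action.submit_tool_outputs.tool_calls:
        # Execute your own function and return its result as a string
        args = json.loads(tool_call.function.arguments)
        result = my_function_registry[tool_call.function.name](**args)
        tool_outputs.append({"tool_call_id": tool_call.id, "output": json.dumps(result)})
    run = client.beta.threads.runs.submit_tool_outputs(
        thread_id=thread.id,
        run_id=run.id,
        tool_outputs=tool_outputs,
    )
```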
#### Polling for updates
If you are not using [streaming](/docs/assistants/overview/step-4-create-a-run?context=with-streaming), in order to keep the status of your run up to date, you will have to periodically [retrieve the Run](/docs/api-reference/runs/getRun) object. You can check the status of the run each time you retrieve the object to determine what your application should do next.
You can optionally use Polling Helpers in our [Node](https://github.com/openai/openai-node?tab=readme-ov-file#polling-helpers) and [Python](https://github.com/openai/openai-python?tab=readme-ov-file#polling-helpers) SDKs to help you with this. These helpers will automatically poll the Run object for you and return the Run object when it's in a terminal state.
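If you prefer to poll manually, a simple loop like the sketch below works; it assumes `client`, `thread`, and `assistant` from the earlier snippets. The SDK polling helpers linked above wrap the same pattern.

```python
import time

run = client.beta.threads.runs.create(thread_id=thread.id, assistant_id=assistant.id)

# Poll until the Run either needs our attention or reaches a terminal state.
while run.status in ("queued", "in_progress", "cancelling"):
    time.sleep(1)
    run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)

print(run.status)  # e.g. completed, requires_action, failed, cancelled, expired, incomplete
```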
#### Thread locks
When a Run is `in_progress` and not in a terminal state, the Thread is locked. This means that:
- New Messages cannot be added to the Thread.
- New Runs cannot be created on the Thread.
#### Run steps
![Run steps lifecycle - diagram showing possible status transitions](https://cdn.openai.com/API/docs/images/diagram-2.png)
Run step statuses have the same meaning as Run statuses.
Most of the interesting detail in the Run Step object lives in the `step_details` field. There can be two types of step details:
1. `message_creation`: This Run Step is created when the Assistant creates a Message on the Thread.
2. `tool_calls`: This Run Step is created when the Assistant calls a tool. Details around this are covered in the relevant sections of the [Tools](/docs/assistants/tools) guide.
## Data access guidance
Currently, Assistants, Threads, Messages, and Vector Stores created via the API are scoped to the Project they're created in. As such, any person with API key access to that Project is able to read or write Assistants, Threads, Messages, and Runs in the Project.
We strongly recommend the following data access controls:
- _Implement authorization._ Before performing reads or writes on Assistants, Threads, Messages, and Vector Stores, ensure that the end-user is authorized to do so. For example, store in your database the object IDs that the end-user has access to, and check it before fetching the object ID with the API.
- _Restrict API key access._ Carefully consider who in your organization should have API keys and be part of a Project. Periodically audit this list. API keys enable a wide range of operations including reading and modifying sensitive information, such as Messages and Files.
- _Create separate accounts._ Consider creating separate Projects for different applications in order to isolate data across multiple applications.
## Next
Now that you have explored how Assistants work, the next step is to explore [Assistant Tools](/docs/assistants/tools) which covers topics like Function calling, File Search, and Code Interpreter.

@ -0,0 +1,95 @@
### Using in-memory image data
The Node.js examples in the guide above use the `fs` module to read image data from disk. In some cases, you may have your image data in memory instead. Here's an example API call that uses image data stored in a Node.js `Buffer` object:
```javascript
const openai = new OpenAI();
// This is the Buffer object that contains your image data
const buffer = [your image data];
// Set a `name` that ends with .png so that the API knows it's a PNG image
buffer.name = "image.png";
async function main() {
const image = await openai.images.createVariation({ model: "dall-e-2", image: buffer, n: 1, size: "1024x1024" });
console.log(image.data);
}
main();
```
### Working with TypeScript
If you're using TypeScript, you may encounter some quirks with image file arguments. Here's an example of working around the type mismatch by explicitly casting the argument:
```javascript
const openai = new OpenAI();
async function main() {
// Cast the ReadStream to `any` to appease the TypeScript compiler
const image = await openai.images.createVariation({
image: fs.createReadStream("image.png") as any,
});
console.log(image.data);
}
main();
```
And here's a similar example for in-memory image data:
```javascript
const openai = new OpenAI();
// This is the Buffer object that contains your image data
const buffer: Buffer = [your image data];
// Cast the buffer to `any` so that we can set the `name` property
const file: any = buffer;
// Set a `name` that ends with .png so that the API knows it's a PNG image
file.name = "image.png";
async function main() {
  const image = await openai.images.createVariation({
    image: file,
    n: 1,
    size: "1024x1024",
  });
console.log(image.data);
}
main();
```
### Error handling
API requests can potentially return errors due to invalid inputs, rate limits, or other issues. These errors can be handled with a `try...catch` statement, and the error details can be found in either `error.response` or `error.message`:
```javascript
const openai = new OpenAI();
async function main() {
try {
const image = await openai.images.createVariation({
image: fs.createReadStream("image.png"),
n: 1,
size: "1024x1024",
});
console.log(image.data);
} catch (error) {
if (error.response) {
console.log(error.response.status);
console.log(error.response.data);
} else {
console.log(error.message);
}
}
}
main();
```

@ -0,0 +1,69 @@
### Using in-memory image data
The Python examples in the guide above use the `open` function to read image data from disk. In some cases, you may have your image data in memory instead. Here's an example API call that uses image data stored in a `BytesIO` object:
```python
from io import BytesIO
from openai import OpenAI
client = OpenAI()
# This is the BytesIO object that contains your image data
byte_stream: BytesIO = [your image data]
byte_array = byte_stream.getvalue()
response = client.images.create_variation(
image=byte_array,
n=1,
model="dall-e-2",
size="1024x1024"
)
```
### Operating on image data
It may be useful to perform operations on images before passing them to the API. Here's an example that uses `PIL` to resize an image:
```python
from io import BytesIO
from PIL import Image
from openai import OpenAI
client = OpenAI()
# Read the image file from disk and resize it
image = Image.open("image.png")
width, height = 256, 256
image = image.resize((width, height))
# Convert the image to a BytesIO object
byte_stream = BytesIO()
image.save(byte_stream, format='PNG')
byte_array = byte_stream.getvalue()
response = client.images.create_variation(
image=byte_array,
n=1,
model="dall-e-2",
size="1024x1024"
)
```
### Error handling
API requests can potentially return errors due to invalid inputs, rate limits, or other issues. These errors can be handled with a `try...except` statement, and the error details can be found in `e.error`:
```python
import openai
from openai import OpenAI
client = OpenAI()
try:
response = client.images.create_variation(
image=open("image_edit_mask.png", "rb"),
n=1,
model="dall-e-2",
size="1024x1024"
)
print(response.data[0].url)
except openai.OpenAIError as e:
print(e.http_status)
print(e.error)
```

@ -0,0 +1,230 @@
# Image generation
Learn how to generate or manipulate images with DALL·E in the API.
Looking to generate images in ChatGPT? Head to [chatgpt.com](https://chatgpt.com/).
## Introduction
The Images API provides three methods for interacting with images:
1. Creating images from scratch based on a text prompt (DALL·E 3 and DALL·E 2)
2. Creating edited versions of images by having the model replace some areas of a pre-existing image, based on a new text prompt (DALL·E 2 only)
3. Creating variations of an existing image (DALL·E 2 only)
This guide covers the basics of using these three API endpoints with useful code samples. To try DALL·E 3, head to [ChatGPT](https://chatgpt.com/).
## Usage
### Generations
The [image generations](/docs/api-reference/images/create) endpoint allows you to create an original image given a text prompt. When using DALL·E 3, images can have a size of 1024x1024, 1024x1792 or 1792x1024 pixels.
By default, images are generated at `standard` quality, but when using DALL·E 3 you can set `quality: "hd"` for enhanced detail. Square, standard quality images are the fastest to generate.
You can request 1 image at a time with DALL·E 3 (request more by making parallel requests) or up to 10 images at a time using DALL·E 2 with the [n parameter](/docs/api-reference/images/create#images/create-n).
<CodeSample
title="Generate an image"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
response = client.images.generate(
model="dall-e-3",
prompt="a white siamese cat",
size="1024x1024",
quality="standard",
n=1,
)\n
image_url = response.data[0].url
`.trim(),
"node.js": `
const response = await openai.images.generate({
model: "dall-e-3",
prompt: "a white siamese cat",
n: 1,
size: "1024x1024",
});
image_url = response.data[0].url;
`.trim(),
curl: `
curl https://api.openai.com/v1/images/generations \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{
"model": "dall-e-3",
"prompt": "a white siamese cat",
"n": 1,
"size": "1024x1024"
}'
`.trim(),
}}
/>
<IconItem
icon={}
color="red"
title="What is new with DALL·E 3"
className="mt-6"
>
Explore what is new with DALL·E 3 in the OpenAI Cookbook
## Prompting
With the release of DALL·E 3, the model now takes in the prompt provided and automatically rewrites it for safety reasons and to add more detail (more detailed prompts generally result in higher quality images).
While it is not currently possible to disable this feature, you can use prompting to get outputs closer to your requested image by adding the following to your prompt: `I NEED to test how the tool works with extremely simple prompts. DO NOT add any detail, just use it AS-IS:`.
The updated prompt is visible in the `revised_prompt` field of the data response object.
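For example, here is a short sketch that prints the rewritten prompt next to the generated image URL:

```python
from openai import OpenAI

client = OpenAI()

response = client.images.generate(
    model="dall-e-3",
    prompt="a white siamese cat",
    size="1024x1024",
    n=1,
)

print(response.data[0].revised_prompt)  # the prompt DALL·E 3 actually used
print(response.data[0].url)
```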
## Example DALL·E 3 generations
| Prompt | Generation |
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------- |
| A photograph of a white Siamese cat. | |
Each image can be returned as either a URL or Base64 data, using the [response_format](/docs/api-reference/images/create#images/create-response_format) parameter. URLs will expire after an hour.
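For example, here is a sketch that requests Base64 data and saves the image locally rather than depending on the short-lived URL; the output filename is arbitrary.

```python
import base64

from openai import OpenAI

client = OpenAI()

response = client.images.generate(
    model="dall-e-3",
    prompt="a white siamese cat",
    size="1024x1024",
    response_format="b64_json",  # return the image as Base64 instead of a URL
    n=1,
)

with open("siamese_cat.png", "wb") as f:
    f.write(base64.b64decode(response.data[0].b64_json))
```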
### Edits (DALL·E 2 only)
Also known as "inpainting", the [image edits](/docs/api-reference/images/create-edit) endpoint allows you to edit or extend an image by uploading an image and mask indicating which areas should be replaced. The transparent areas of the mask indicate where the image should be edited, and the prompt should describe the full new image, **not just the erased area**. This endpoint can enable experiences like DALL·E image editing in ChatGPT Plus.
<CodeSample
title="Edit an image"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
response = client.images.edit(
model="dall-e-2",
image=open("sunlit_lounge.png", "rb"),
mask=open("mask.png", "rb"),
prompt="A sunlit indoor lounge area with a pool containing a flamingo",
n=1,
size="1024x1024"
)
image_url = response.data[0].url
`.trim(),
"node.js": `
const response = await openai.images.edit({
model: "dall-e-2",
image: fs.createReadStream("sunlit_lounge.png"),
mask: fs.createReadStream("mask.png"),
prompt: "A sunlit indoor lounge area with a pool containing a flamingo",
n: 1,
size: "1024x1024"
});
image_url = response.data[0].url;
`.trim(),
curl: `
curl https://api.openai.com/v1/images/edits \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-F model="dall-e-2" \\
-F image="@sunlit_lounge.png" \\
-F mask="@mask.png" \\
-F prompt="A sunlit indoor lounge area with a pool containing a flamingo" \\
-F n=1 \\
-F size="1024x1024"
`.trim(),
}}
/>
| Image | Mask | Output |
| ---------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------- |
| | | |
Prompt: a sunlit indoor lounge area with a pool containing a flamingo
The uploaded image and mask must both be square PNG images less than 4MB in size, and also must have the same dimensions as each other. The non-transparent areas of the mask are not used when generating the output, so they don't necessarily need to match the original image like the example above.
### Variations (DALL·E 2 only)
The [image variations](/docs/api-reference/images/create-variation) endpoint allows you to generate a variation of a given image.
<CodeSample
title="Generate an image variation"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
response = client.images.create_variation(
model="dall-e-2",
image=open("corgi_and_cat_paw.png", "rb"),
n=1,
size="1024x1024"
)\n
image_url = response.data[0].url
`.trim(),
"node.js": `
const response = await openai.images.createVariation({
model: "dall-e-2",
image: fs.createReadStream("corgi_and_cat_paw.png"),
n: 1,
size: "1024x1024"
});
image_url = response.data[0].url;
`.trim(),
curl: `
curl https://api.openai.com/v1/images/variations \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-F model="dall-e-2" \\
-F image="@corgi_and_cat_paw.png" \\
-F n=1 \\
-F size="1024x1024"
`.trim(),
}}
/>
| Image | Output |
| --------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
| | |
Similar to the edits endpoint, the input image must be a square PNG image less than 4MB in size.
### Content moderation
Prompts and images are filtered based on our [content policy](https://labs.openai.com/policies/content-policy), returning an error when a prompt or image is flagged.
## Language-specific tips
<ContentSwitcher
options={[
{
value: "node",
label: "Node.js",
content: ,
},
{
value: "python",
label: "Python",
content: ,
},
]}
initialValue="node"
/>

@ -0,0 +1,71 @@
# Developer quickstart
## Get up and running with the OpenAI API
Looking for ChatGPT? Head to chatgpt.com.
The OpenAI API provides a simple interface for developers to create an intelligence layer in their applications, powered by OpenAI's state-of-the-art models. The Chat Completions endpoint powers ChatGPT and provides a simple way to take text as input and use a model like GPT-4o to generate an output.
<IconItem
icon={}
color="green"
title="Want to jump straight to the code?"
className="mt-6"
>
Skip the quickstart and dive into the API reference.
This quickstart is designed to help get your local development environment set up and send your first API request. If you are an experienced developer or want to just dive into using the OpenAI API, the [API reference](/docs/api-reference) or the [GPT guide](/docs/guides/text-generation) are great places to start. Throughout this quickstart, you will learn:
- How to set up your development environment
- How to install the latest SDKs
- Some of the basic concepts of the OpenAI API
- How to send your first API request
If you run into any challenges or have questions getting started, please join our [developer forum](https://community.openai.com).
## Account setup
First, create an [OpenAI account](/signup) or [sign in](/login). Next, navigate to the [API key page](/account/api-keys) and "Create new secret key", optionally naming the key. Make sure to save this somewhere safe and do not share it with anyone.
## Quickstart language selection
Select the tool or language you want to get started using the OpenAI API with.
<ContentSwitcher
options={[
{
value: "curl",
label: "curl",
content: ,
},
{
value: "python",
label: "Python",
content: ,
},
{
value: "node",
label: "Node.js",
content: ,
},
]}
initialValue="python"
/>
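Whichever option you pick, your first request will have the same basic shape. As a reference point, here is a minimal Python sketch (it assumes the `openai` package is installed and `OPENAI_API_KEY` is set in your environment):
```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about recursion in programming."},
    ],
)

print(completion.choices[0].message.content)
```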
## Next steps
Now that you have made your first OpenAI API request, it is time to explore what else is possible:
- For more detailed information on our models and the API, see our [text generation guide](/docs/guides/text-generation).
- Visit the [OpenAI Cookbook](https://cookbook.openai.com/) for in-depth example API use-cases, as well as code snippets for common tasks.
- Wondering what OpenAI's models are capable of? Check out our library of [example prompts](/examples).
- Want to try the API without writing any code? Start experimenting in the [Playground](/playground).
- Keep our [usage policies](https://openai.com/policies/usage-policies) in mind as you start building.

@ -0,0 +1,75 @@
# Actions in GPTs
## Introduction
Learn how to build a GPT action that intelligently calls your API.
### What is a GPT?
[GPTs](https://openai.com/blog/introducing-gpts) provide the ability to deeply customize ChatGPT for specific use cases along with custom capabilities. You can create a GPT that:
- Has custom instructions which determine the way the GPT interacts with users
- Includes tools like browsing, DALL·E, and Code Interpreter
- Comes with preset starter prompts for new and returning users
- Has custom actions which allow you to connect the GPT to APIs
And more! If you want to explore what is possible, check out the deep dive on GPTs from OpenAI Developer Day 2023:
<iframe
width="100%"
height="315"
src="https://www.youtube-nocookie.com/embed/pq34V_V5j18?si=q4ZPUe-dS8Ii8YX0"
title="YouTube video player"
frameBorder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
allowFullScreen
>
### What is an action in a GPT?
In addition to using our built-in capabilities (browsing, DALL·E, and Code Interpreter), you can also define custom actions by making one or more APIs available to the GPT. Actions allow GPTs to integrate external data or interact with the real world, such as connecting GPTs to databases, plugging them into your emails, or making them your shopping assistant, all through APIs.
The design of actions builds upon insights from our plugins beta, granting developers greater control over the model and how their APIs are called. Actions are defined using the [OpenAPI specification](https://swagger.io/specification/), which is a standard for describing APIs.
### GPT action flow
To build a GPT with an action, it is important to understand the end-to-end flow.
1. Create a GPT in the ChatGPT UI
- Manually configure or use the GPT builder to create a GPT
- Identify the API(s) you want to use
2. Go to the "Configure" tab in the GPT editor and select "Create new action"
- You will be presented with 3 main options: selecting the authentication schema for the action, inputting the schema itself, and setting the privacy policy URL
- The Schema follows the OpenAPI specification format (not to be confused with OpenAI) to define how the GPT can access an external API
3. Fill in the details for the schema, authentication, and privacy policy.
- When selecting an authentication method, you will have 3 options: "None", "API Key", and "OAuth". We will explore these in depth later on
- For the schema, you can take an existing OpenAPI specification you have for your API or create a new one. If you have already published an OpenAPI specification online, you can import it via the "Import from URL" button
- The privacy policy URL is displayed to the user when they open a GPT and select the drop down in the top left corner showing the name of the GPT
4. Determine the visibility of your GPT
- By default, GPTs are not accessible to everyone
- When you go to save a GPT, you will have the option to "Publish to" a certain audience: "Only me", "Anyone with a link", or "Everyone"
- Each of these visibility options comes with different constraints and requirements. For example, the naming of a GPT has more restrictions if you share it with someone else
5. User(s) engage with your GPT
- Depending on the visibility of your GPT, users might try it via a link you shared, or find it in the GPT store
- If OAuth is required, users will be prompted to login during the session
- Behind the scenes, the GPT injects the information on how you configured the GPT (including any available actions, tools, or instructions) into the context of the model
- Each time a user makes a request, the model sees the available tools, actions, and instructions which determine how the GPT will respond to the request
- If the user request is to check the weather in a specific location and you made a "Check weather" action available, the model will follow the OpenAPI specification you provided to send a request to that API and return the response to the user
## Next steps
Now that you know the basics of how a GPT works and where actions can be used, you might want to:
- Get started building a [GPT with an action](/docs/actions/getting-started)
- Learn how we [built the GPT builder](https://help.openai.com/en/articles/8770868-gpt-builder) itself as a custom GPT with actions
- Familiarize yourself with our [GPT policies](https://openai.com/policies/usage-policies#:~:text=or%20educational%20purposes.-,Building%20with%20ChatGPT,-Shared%20GPTs%20allow)
- Explore the [differences between GPTs and Assistants](https://help.openai.com/en/articles/8673914-gpts-vs-assistants)
- Check out the [GPT data privacy FAQs](https://help.openai.com/en/articles/8554402-gpts-data-privacy-faqs)
- Find answers to [common GPT questions](https://help.openai.com/en/articles/8554407-gpts-faq)

@ -0,0 +1,452 @@
# Latency optimization
This guide covers the core set of principles you can apply to improve latency across a wide variety of LLM-related use cases. These techniques come from working with a wide range of customers and developers on production applications, so they should apply regardless of what you're building, from a granular workflow to an end-to-end chatbot!
While there are many individual techniques, we'll be grouping them into **seven principles** meant to represent a high-level taxonomy of approaches for improving latency.
At the end, we'll walk through an [example](/docs/guides/latency-optimization/example) to see how they can be applied.
## The seven principles
1. [Process tokens faster.](/docs/guides/latency-optimization/1-process-tokens-faster)
2. [Generate fewer tokens.](/docs/guides/latency-optimization/2-generate-fewer-tokens)
3. [Use fewer input tokens.](/docs/guides/latency-optimization/3-use-fewer-input-tokens)
4. [Make fewer requests.](/docs/guides/latency-optimization/4-make-fewer-requests)
5. [Parallelize.](/docs/guides/latency-optimization/5-parallelize)
6. [Make your users wait less.](/docs/guides/latency-optimization/6-make-your-users-wait-less)
7. [Don't default to an LLM.](/docs/guides/latency-optimization/7-don-t-default-to-an-llm)
You can use the friendly, catchy acronym PGIRPWD to remember these. (Processing,
Generation, Input, Requests, Parallelize, Waiting, Don't)
### 1. Process tokens faster
**Inference speed** is probably the first thing that comes to mind when addressing latency (but as you'll see soon, it's far from the only one). This refers to the actual **rate at which the LLM processes tokens**, and is often measured in TPM (tokens per minute) or TPS (tokens per second).
The main factor that influences inference speed is **model size**: smaller models usually run faster (and cheaper), and when used correctly can even outperform larger models. To maintain high-quality performance with smaller models you can explore:
- using a longer, [more detailed prompt](/docs/guides/prompt-engineering/tactic-specify-the-steps-required-to-complete-a-task),
- adding (more) [few-shot examples](/docs/guides/prompt-engineering/tactic-provide-examples), or
- [fine-tuning](/docs/guides/fine-tuning) / distillation.
Other factors that affect inference speed are the amount of compute you have available and any additional inference optimizations you employ. Most people can't influence these factors directly, but if you're curious and have some control over your infra, faster hardware or running engines at a lower saturation may give you a modest TPM boost. And if you're down in the trenches, there are a myriad of inference optimizations that are a bit beyond the scope of this guide.
### 2. Generate fewer tokens
Generating tokens is almost always the highest latency step when using an LLM: as a general heuristic, **cutting 50% of your output tokens may cut ~50% of your latency**. The way you reduce your output size will depend on output type:
If you're generating **natural language**, simply **asking the model to be more concise** ("under 20 words" or "be very brief") may help. You can also use few shot examples and/or fine-tuning to teach the model shorter responses.
If you're generating **structured output**, try to **minimize your output syntax** where possible: shorten function names, omit named arguments, coalesce parameters, etc.
Finally, while not common, you can also use `max_tokens` or `stop_tokens` to end your generation early.
Always remember: an output token cut is a (milli)second earned!
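As a rough sketch, the request below combines a brevity instruction with a `max_tokens` cap (the wording and the 60-token cap are arbitrary illustrations, not recommendations):
```python
from openai import OpenAI

client = OpenAI()

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "Answer in under 20 words."},
        {"role": "user", "content": "Why is the sky blue?"},
    ],
    max_tokens=60,  # hard cap as a safety net; the prompt instruction does most of the work
)

print(response.choices[0].message.content)
```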
### 3. Use fewer input tokens
While reducing the number of input tokens does result in lower latency, this is not usually a significant factor: **cutting 50% of your prompt may only result in a 1-5% latency improvement**. Unless you're working with truly massive context sizes (documents, images), you may want to spend your efforts elsewhere.
That being said, if you _are_ working with massive contexts (or you're set on squeezing every last bit of performance _and_ you've exhausted all other options) you can use the following techniques to reduce your input tokens:
- **Fine-tuning the model**, to replace the need for lengthy instructions / examples.
- **Filtering context input**, like pruning RAG results, cleaning HTML, etc.
- **Maximize shared prompt prefix**, by putting dynamic portions (e.g. RAG results, history, etc) later in the prompt. This makes your request more [KV cache](https://medium.com/@joaolages/kv-caching-explained-276520203249)-friendly (which most LLM providers use) and means fewer input tokens are processed on each request. ([why?](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/))
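As a sketch of that last point, keep the unchanging instructions and examples in a static prefix and append the dynamic data (retrieved context, history, the latest query) at the end; the names below are illustrative:
```python
STATIC_SYSTEM_PROMPT = """You are a customer service bot.
<lengthy instructions and few-shot examples that never change go here>"""

def build_messages(retrieved_context: str, history: list[dict], user_query: str) -> list[dict]:
    # Static prefix first: identical across requests, so it stays KV-cache friendly.
    # Dynamic portions (RAG results, conversation history, latest query) go last.
    return [
        {"role": "system", "content": STATIC_SYSTEM_PROMPT},
        *history,
        {"role": "user", "content": f"Context:\n{retrieved_context}\n\nQuery: {user_query}"},
    ]
```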
### 4. Make fewer requests
Each time you make a request you incur some round-trip latency, and this can start to add up.
If you have sequential steps for the LLM to perform, instead of firing off one request per step consider **putting them in a single prompt and getting them all in a single response**. You'll avoid the additional round-trip latency, and potentially also reduce complexity of processing multiple responses.
One approach is to collect your steps in an enumerated list in the combined prompt, and then ask the model to return the results in named fields of a JSON object. This way you can easily parse out and reference each result!
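Here is a minimal sketch of that pattern, with made-up step names and an illustrative JSON shape (it assumes a model snapshot that supports JSON mode):
```python
import json
from openai import OpenAI

client = OpenAI()

instructions = """Perform the following steps on the user's message and reply with a single JSON object:
1. "summary": summarize the message in one sentence.
2. "sentiment": classify the sentiment as positive, negative, or neutral.
3. "reply": draft a brief response to the user."""

response = client.chat.completions.create(
    model="gpt-4o",
    response_format={"type": "json_object"},  # makes the named fields easy to parse
    messages=[
        {"role": "system", "content": instructions},
        {"role": "user", "content": "The package arrived two weeks late and the box was damaged."},
    ],
)

result = json.loads(response.choices[0].message.content)
print(result["summary"], result["sentiment"], result["reply"], sep="\n")
```
One round trip returns all three results, which you can then parse out and reference individually.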
### 5. Parallelize
Parallelization can be very powerful when performing multiple steps with an LLM.
If the steps **are _not_ strictly sequential**, you can **split them out into parallel calls**. Two shirts take just as long to dry as one.
If the steps **_are_ strictly sequential**, however, you might still be able to **leverage speculative execution**. This is particularly effective for classification steps where one outcome is more likely than the others (e.g. moderation).
1. Start step 1 & step 2 simultaneously (e.g. input moderation & story generation)
2. Verify the result of step 1
3. If the result of step 1 was not what you expected, cancel step 2 (and retry if necessary)
If your guess for step 1 is right, then you essentially got to run step 2 with zero added latency!
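A sketch of that moderation-plus-generation example with the async Python client (the story prompt and refusal message are placeholders):
```python
import asyncio
from openai import AsyncOpenAI

client = AsyncOpenAI()

async def is_flagged(text: str) -> bool:
    result = await client.moderations.create(input=text)
    return result.results[0].flagged

async def generate_story(text: str) -> str:
    response = await client.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": f"Write a short story about: {text}"}],
    )
    return response.choices[0].message.content

async def respond(user_input: str) -> str:
    # Start both steps at once, betting that moderation will pass.
    moderation_task = asyncio.create_task(is_flagged(user_input))
    story_task = asyncio.create_task(generate_story(user_input))

    if await moderation_task:       # bet lost: discard the speculative work
        story_task.cancel()
        return "Sorry, I can't help with that."
    return await story_task         # bet won: step 2 ran with zero added latency

print(asyncio.run(respond("a flamingo who learns to ice skate")))
```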
### 6. Make your users wait less
There's a huge difference between **waiting** and **watching progress happen**; make sure your users experience the latter. Here are a few techniques:
- **Streaming**: The single most effective approach, as it cuts the _waiting_ time to a second or less; see the sketch after this list. (ChatGPT would feel pretty different if you saw nothing until each response was done.)
- **Chunking**: If your output needs further processing before being shown to the user (moderation, translation) consider **processing it in chunks** instead of all at once. Do this by streaming to your backend, then sending processed chunks to your frontend.
- **Show your steps**: If you're taking multiple steps or using tools, surface this to the user. The more real progress you can show, the better.
- **Loading states**: Spinners and progress bars go a long way.
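Here is a minimal streaming sketch with the Python client, printing tokens as they arrive rather than waiting for the full response:
```python
from openai import OpenAI

client = OpenAI()

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Explain streaming in two sentences."}],
    stream=True,  # chunks arrive as tokens are generated
)

for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)  # show progress immediately
```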
Note that while **showing your steps & having loading states** have a mostly psychological effect, **streaming & chunking** genuinely do reduce overall latency once you consider the app + user system: the user will finish reading a response sooner.
### 7. Don't default to an LLM
LLMs are extremely powerful and versatile, and are therefore sometimes used in cases where a **faster classical method** would be more appropriate. Identifying such cases may allow you to cut your latency significantly. Consider the following examples:
- **Hard-coding:** If your **output** is highly constrained, you may not need an LLM to generate it. Action confirmations, refusal messages, and requests for standard input are all great candidates to be hard-coded, as sketched after this list. (You can even use the age-old method of coming up with a few variations for each.)
- **Pre-computing:** If your **input** is constrained (e.g. category selection) you can generate multiple responses in advance, and just make sure you never show the same one to a user twice.
- **Leveraging UI:** Summarized metrics, reports, or search results are sometimes better conveyed with classical, bespoke UI components rather than LLM-generated text.
- **Traditional optimization techniques:** An LLM application is still an application; binary search, caching, hash maps, and runtime complexity are all _still_ useful in a world of LLMs.
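A toy sketch of the first two ideas, checking a table of canned responses before ever calling a model (the intents, wording, and `call_llm` helper are all hypothetical):
```python
import random

# Hard-coded variants for highly constrained outputs.
CANNED_RESPONSES = {
    "confirm_cancellation": [
        "Your order has been cancelled.",
        "Done! That order is now cancelled.",
    ],
    "request_order_number": [
        "Could you share your order number so I can look that up?",
    ],
}

def respond(intent: str, user_message: str) -> str:
    if intent in CANNED_RESPONSES:
        # Zero-latency path: no model call at all.
        return random.choice(CANNED_RESPONSES[intent])
    # Fall back to the LLM only for open-ended requests.
    return call_llm(user_message)  # hypothetical helper defined elsewhere in your app
```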
## Example
Let's now look at a sample application, identify potential latency optimizations, and propose some solutions!
We'll be analyzing the architecture and prompts of a hypothetical customer service bot inspired by real production applications. The [architecture and prompts](/docs/guides/latency-optimization/architecture-and-prompts) section sets the stage, and the [analysis and optimizations](/docs/guides/latency-optimization/analysis-and-optimizations) section will walk through the latency optimization process.
You'll notice this example doesn't cover every single principle, much like real-world
use cases don't require applying every technique.
### Architecture and prompts
The following is the **initial architecture** for a hypothetical **customer service bot**. This is what we'll be making changes to.
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-latency-customer-service-0.png)
At a high level, the diagram flow describes the following process:
1. A user sends a message as part of an ongoing conversation.
2. The last message is turned into a **self-contained query** (see examples in prompt).
3. We determine whether or not **additional (retrieved) information is required** to respond to that query.
4. **Retrieval** is performed, producing search results.
5. The assistant **reasons** about the user's query and search results, and **produces a response**.
6. The response is sent back to the user.
Below are the prompts used in each part of the diagram. While they are still only hypothetical and simplified, they are written with the same structure and wording that you would find in a production application.
Places where you see placeholders like "[user input here]" represent dynamic portions that would be replaced by actual data at runtime.
Re-writes user query to be a self-contained search query.
```example-chat
SYSTEM: Given the previous conversation, re-write the last user query so it contains
all necessary context.
# Example
History: [{user: "What is your return policy?"},{assistant: "..."}]
User Query: "How long does it cover?"
Response: "How long does the return policy cover?"
# Conversation
[last 3 messages of conversation]
# User Query
[last user query]
USER: [JSON-formatted input conversation here]
```
Determines whether a query requires performing retrieval to respond.
```example-chat
SYSTEM: Given a user query, determine whether it requires doing a realtime lookup to
respond to.
# Examples
User Query: "How can I return this item after 30 days?"
Response: "true"
User Query: "Thank you!"
Response: "false"
USER: [input user query here]
```
Fills the fields of a JSON to reason through a pre-defined set of steps to produce a final response given a user conversation and relevant retrieved information.
```example-chat
SYSTEM: You are a helpful customer service bot.
Use the result JSON to reason about each user query - use the retrieved context.
# Example
User: "My computer screen is cracked! I want it fixed now!!!"
Assistant Response:
{
"message_is_conversation_continuation": "True",
"number_of_messages_in_conversation_so_far": "1",
"user_sentiment": "Aggravated",
"query_type": "Hardware Issue",
"response_tone": "Validating and solution-oriented",
"response_requirements": "Propose options for repair or replacement.",
"user_requesting_to_talk_to_human": "False",
"enough_information_in_context": "True"
"response": "..."
}
USER: # Relevant Information
` ` `
[retrieved context]
` ` `
USER: [input user query here]
```
### Analysis and optimizations
#### Part 1: Looking at retrieval prompts
Looking at the architecture, the first thing that stands out is the **consecutive GPT-4 calls** - these hint at a potential inefficiency, and can often be replaced by a single call or parallel calls.
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-latency-customer-service-2.png)
In this case, since the check for retrieval requires the contextualized query, let's **combine them into a single prompt** to [make fewer requests](/docs/guides/latency-optimization/4-make-fewer-requests).
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-latency-customer-service-3.png)
**What changed?** Before, we had one prompt to re-write the query and one to determine whether this requires doing a retrieval lookup. Now, this combined prompt does both. Specifically, notice the updated instruction in the first line of the prompt, and the updated output JSON:
```jsx
{
query:"[contextualized query]",
retrieval:"[true/false - whether retrieval is required]"
}
```
```example-chat
SYSTEM: Given the previous conversation, re-write the last user query so it contains
all necessary context. Then, determine whether the full request requires doing a
realtime lookup to respond to.
Respond in the following form:
{
query:"[contextualized query]",
retrieval:"[true/false - whether retrieval is required]"
}
# Examples
History: [{user: "What is your return policy?"},{assistant: "..."}]
User Query: "How long does it cover?"
Response: {query: "How long does the return policy cover?", retrieval: "true"}
History: [{user: "How can I return this item after 30 days?"},{assistant: "..."}]
User Query: "Thank you!"
Response: {query: "Thank you!", retrieval: "false"}
# Conversation
[last 3 messages of conversation]
# User Query
[last user query]
USER: [JSON-formatted input conversation here]
```
Actually, adding context and determining whether to retrieve are very straightforward and well defined tasks, so we can likely use a **smaller, fine-tuned model** instead. Switching to GPT-3.5 will let us [process tokens faster](/docs/guides/latency-optimization/1-process-tokens-faster).
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-latency-customer-service-4.png)
#### Part 2: Analyzing the assistant prompt
Let's now direct our attention to the Assistant prompt. There seem to be many distinct steps happening as it fills the JSON fields; this could indicate an opportunity to [parallelize](/docs/guides/latency-optimization/5-parallelize).
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-latency-customer-service-5.png)
However, let's pretend we have run some tests and discovered that splitting the reasoning steps in the JSON produces worse responses, so we need to explore different solutions.
**Could we use a fine-tuned GPT-3.5 instead of GPT-4?** Maybe, but in general, open-ended responses from assistants are best left to GPT-4 so it can better handle a greater range of cases. That being said, looking at the reasoning steps themselves, they may not all require GPT-4-level reasoning to produce. Their well defined, limited scope makes them **good potential candidates for fine-tuning**.
```jsx
{
"message_is_conversation_continuation": "True", // <-
"number_of_messages_in_conversation_so_far": "1", // <-
"user_sentiment": "Aggravated", // <-
"query_type": "Hardware Issue", // <-
"response_tone": "Validating and solution-oriented", // <-
"response_requirements": "Propose options for repair or replacement.", // <-
"user_requesting_to_talk_to_human": "False", // <-
"enough_information_in_context": "True" // <-
"response": "..." // X -- benefits from GPT-4
}
```
This opens up the possibility of a trade-off. Do we keep this as a **single request entirely generated by GPT-4**, or **split it into two sequential requests** and use GPT-3.5 for all but the final response? We have a case of conflicting principles: the first option lets us [make fewer requests](/docs/guides/latency-optimization/4-make-fewer-requests), but the second may let us [process tokens faster](/docs/guides/latency-optimization/1-process-tokens-faster).
As with many optimization tradeoffs, the answer will depend on the details. For example:
- The proportion of tokens in the `response` vs the other fields.
- The average latency decrease from processing most fields faster.
- The average latency _increase_ from doing two requests instead of one.
The conclusion will vary by case, and the best way to make the determination is by testing this with production examples. In this case, let's pretend the tests indicated it's favorable to split the prompt in two to [process tokens faster](/docs/guides/latency-optimization/1-process-tokens-faster).
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-latency-customer-service-6.png)
**Note:** We'll be grouping `response` and `enough_information_in_context` together in the second prompt to avoid passing the retrieved context to both new prompts.
This prompt will be passed to GPT-3.5 and can be fine-tuned on curated examples.
**What changed?** The "enough_information_in_context" and "response" fields were removed, and the retrieval results are no longer loaded into this prompt.
```example-chat
SYSTEM: You are a helpful customer service bot.
Based on the previous conversation, respond in a JSON to determine the required
fields.
# Example
User: "My freaking computer screen is cracked!"
Assistant Response:
{
"message_is_conversation_continuation": "True",
"number_of_messages_in_conversation_so_far": "1",
"user_sentiment": "Aggravated",
"query_type": "Hardware Issue",
"response_tone": "Validating and solution-oriented",
"response_requirements": "Propose options for repair or replacement.",
"user_requesting_to_talk_to_human": "False",
}
```
This prompt will be processed by GPT-4 and will receive the reasoning steps determined in the prior prompt, as well as the results from retrieval.
**What changed?** All steps were removed except for "enough_information_in_context" and "response". Additionally, the JSON we were previously filling in as output will be passed in to this prompt.
```example-chat
SYSTEM: You are a helpful customer service bot.
Use the retrieved context, as well as these pre-classified fields, to respond to
the user's query.
# Reasoning Fields
` ` `
[reasoning json determined in previous GPT-3.5 call]
` ` `
# Example
User: "My freaking computer screen is cracked!"
Assistant Response:
{
"enough_information_in_context": "True"
"response": "..."
}
USER: # Relevant Information
` ` `
[retrieved context]
` ` `
```
In fact, now that the reasoning prompt does not depend on the retrieved context we can [parallelize](/docs/guides/latency-optimization/5-parallelize) and fire it off at the same time as the retrieval prompts.
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-latency-customer-service-6b.png)
#### Part 3: Optimizing the structured output
Let's take another look at the reasoning prompt.
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-latency-customer-service-7b.png)
Taking a closer look at the reasoning JSON, you may notice the field names themselves are quite long.
```jsx
{
"message_is_conversation_continuation": "True", // <-
"number_of_messages_in_conversation_so_far": "1", // <-
"user_sentiment": "Aggravated", // <-
"query_type": "Hardware Issue", // <-
"response_tone": "Validating and solution-oriented", // <-
"response_requirements": "Propose options for repair or replacement.", // <-
"user_requesting_to_talk_to_human": "False", // <-
}
```
By making them shorter and moving explanations to the comments we can [generate fewer tokens](/docs/guides/latency-optimization/2-generate-fewer-tokens).
```jsx
{
"cont": "True", // whether last message is a continuation
"n_msg": "1", // number of messages in the continued conversation
"tone_in": "Aggravated", // sentiment of user query
"type": "Hardware Issue", // type of the user query
"tone_out": "Validating and solution-oriented", // desired tone for response
"reqs": "Propose options for repair or replacement.", // response requirements
"human": "False", // whether user is expressing want to talk to human
}
```
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-latency-customer-service-8b.png)
This small change removed 19 output tokens. While with GPT-3.5 this may only result in an improvement of a few milliseconds, with GPT-4 this could shave off up to a second.
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/token-counts-latency-customer-service-large.png)
You might imagine, however, how this can have quite a significant impact for larger model outputs.
We could go further and use single characters for the JSON fields, or put everything in an array, but this may start to hurt our response quality. The best way to know, once again, is through testing.
#### Example wrap-up
Let's review the optimizations we implemented for the customer service bot example:
![Assistants object architecture diagram](https://cdn.openai.com/API/docs/images/diagram-latency-customer-service-11b.png)
1. **Combined** query contextualization and retrieval check steps to [make fewer requests](/docs/guides/latency-optimization/4-make-fewer-requests).
2. For the new prompt, **switched to a smaller, fine-tuned GPT-3.5** to [process tokens faster](/docs/guides/latency-optimization/1-process-tokens-faster).
3. Split the assistant prompt in two, **switching to a smaller, fine-tuned GPT-3.5** for the reasoning, again to [process tokens faster](/docs/guides/latency-optimization/1-process-tokens-faster).
4. [Parallelized](/docs/guides/latency-optimization/5-parallelize) the retrieval checks and the reasoning steps.
5. **Shortened reasoning field names** and moved comments into the prompt, to [generate fewer tokens](/docs/guides/latency-optimization/2-generate-fewer-tokens).
## Conclusion
You should now be familiar with the core set of principles you can use to improve latency in your LLM application. As you explore these techniques, always remember to measure where your latency is coming from, and test the impact of each solution you try. Now go make your application _fly!_

@ -0,0 +1,198 @@
# Libraries
## Python library
We provide a [Python library](https://github.com/openai/openai-python), which you can install by running:
```bash
pip install openai
```
Once installed, you can use the library and your secret key to run the following:
```python
from openai import OpenAI
client = OpenAI(
# Defaults to os.environ.get("OPENAI_API_KEY")
)
chat_completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello world"}]
)
```
The bindings also will install a command-line utility you can use as follows:
```bash
$ openai api chat_completions.create -m gpt-3.5-turbo -g user "Hello world"
```
---
## TypeScript / JavaScript library
We provide a [TypeScript / JavaScript library](https://github.com/openai/openai-node) with support for Node.js and various [other runtimes](https://deno.land/x/openai). Install it by running:
```bash
npm install --save openai
# or
yarn add openai
```
Once installed, you can use the library and your secret key to run the following:
```javascript
import OpenAI from "openai";

const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
const chatCompletion = await openai.chat.completions.create({
messages: [{ role: "user", content: "Say this is a test" }],
model: "gpt-3.5-turbo",
});
```
---
## .NET library (Beta)
We provide a [.NET library](https://github.com/openai/openai-dotnet), which you can install by running:
```bash
dotnet add package OpenAI --prerelease
```
Once installed, you can use the library and your secret key to run the following:
```csharp
using OpenAI.Chat;
ChatClient client = new("gpt-3.5-turbo", Environment.GetEnvironmentVariable("OPENAI_API_KEY"));
ChatCompletion chatCompletion = client.CompleteChat(
[
new UserChatMessage("Say 'this is a test.'"),
]);
```
---
## Azure OpenAI libraries
Microsoft's Azure team maintains libraries that are compatible with both the OpenAI API and Azure OpenAI services. Read the library documentation below to learn how you can use them with the OpenAI API.
- [Azure OpenAI client library for .NET](https://github.com/Azure/azure-sdk-for-net/tree/main/sdk/openai/Azure.AI.OpenAI)
- [Azure OpenAI client library for JavaScript](https://github.com/Azure/azure-sdk-for-js/tree/main/sdk/openai/openai)
- [Azure OpenAI client library for Java](https://github.com/Azure/azure-sdk-for-java/tree/main/sdk/openai/azure-ai-openai)
- [Azure OpenAI client library for Go](https://github.com/Azure/azure-sdk-for-go/tree/main/sdk/ai/azopenai)
---
## Community libraries
The libraries below are built and maintained by the broader developer community. If you'd like to add a new library here, please follow the instructions in our [help center article](https://help.openai.com/en/articles/6684216-adding-your-api-client-to-the-community-libraries-page) on adding community libraries. You can also [watch our OpenAPI specification](https://github.com/openai/openai-openapi) repository on GitHub to get timely updates on when we make changes to our API.
Please note that OpenAI does not verify the correctness or security of these projects. **Use them at your own risk!**
### C# / .NET
- [Betalgo.OpenAI](https://github.com/betalgo/openai) by [Betalgo](https://github.com/betalgo)
- [OpenAI-API-dotnet](https://github.com/OkGoDoIt/OpenAI-API-dotnet) by [OkGoDoIt](https://github.com/OkGoDoIt)
- [OpenAI-DotNet](https://github.com/RageAgainstThePixel/OpenAI-DotNet) by [RageAgainstThePixel](https://github.com/RageAgainstThePixel)
### C++
- [liboai](https://github.com/D7EAD/liboai) by [D7EAD](https://github.com/D7EAD)
### Clojure
- [openai-clojure](https://github.com/wkok/openai-clojure) by [wkok](https://github.com/wkok)
### Crystal
- [openai-crystal](https://github.com/sferik/openai-crystal) by [sferik](https://github.com/sferik)
### Dart/Flutter
- [openai](https://github.com/anasfik/openai) by [anasfik](https://github.com/anasfik)
### Delphi
- [DelphiOpenAI](https://github.com/HemulGM/DelphiOpenAI) by [HemulGM](https://github.com/HemulGM)
### Elixir
- [openai.ex](https://github.com/mgallo/openai.ex) by [mgallo](https://github.com/mgallo)
### Go
- [go-gpt3](https://github.com/sashabaranov/go-gpt3) by [sashabaranov](https://github.com/sashabaranov)
### Java
- [openai-java](https://github.com/TheoKanning/openai-java) by [Theo Kanning](https://github.com/TheoKanning)
### Julia
- [OpenAI.jl](https://github.com/rory-linehan/OpenAI.jl) by [rory-linehan](https://github.com/rory-linehan)
### Kotlin
- [openai-kotlin](https://github.com/Aallam/openai-kotlin) by [Mouaad Aallam](https://github.com/Aallam)
### Node.js
- [openai-api](https://www.npmjs.com/package/openai-api) by [Njerschow](https://github.com/Njerschow)
- [openai-api-node](https://www.npmjs.com/package/openai-api-node) by [erlapso](https://github.com/erlapso)
- [gpt-x](https://www.npmjs.com/package/gpt-x) by [ceifa](https://github.com/ceifa)
- [gpt3](https://www.npmjs.com/package/gpt3) by [poteat](https://github.com/poteat)
- [gpts](https://www.npmjs.com/package/gpts) by [thencc](https://github.com/thencc)
- [@dalenguyen/openai](https://www.npmjs.com/package/@dalenguyen/openai) by [dalenguyen](https://github.com/dalenguyen)
- [tectalic/openai](https://github.com/tectalichq/public-openai-client-js) by [tectalic](https://tectalic.com/)
### PHP
- [orhanerday/open-ai](https://packagist.org/packages/orhanerday/open-ai) by [orhanerday](https://github.com/orhanerday)
- [tectalic/openai](https://github.com/tectalichq/public-openai-client-php) by [tectalic](https://tectalic.com/)
- [openai-php client](https://github.com/openai-php/client) by [openai-php](https://github.com/openai-php)
### Python
- [chronology](https://github.com/OthersideAI/chronology) by [OthersideAI](https://www.othersideai.com/)
### R
- [rgpt3](https://github.com/ben-aaron188/rgpt3) by [ben-aaron188](https://github.com/ben-aaron188)
### Ruby
- [openai](https://github.com/nileshtrivedi/openai/) by [nileshtrivedi](https://github.com/nileshtrivedi)
- [ruby-openai](https://github.com/alexrudall/ruby-openai) by [alexrudall](https://github.com/alexrudall)
### Rust
- [async-openai](https://github.com/64bit/async-openai) by [64bit](https://github.com/64bit)
- [fieri](https://github.com/lbkolev/fieri) by [lbkolev](https://github.com/lbkolev)
### Scala
- [openai-scala-client](https://github.com/cequence-io/openai-scala-client) by [cequence-io](https://github.com/cequence-io)
### Swift
- [OpenAIKit](https://github.com/dylanshine/openai-kit) by [dylanshine](https://github.com/dylanshine)
- [OpenAI](https://github.com/MacPaw/OpenAI/) by [MacPaw](https://github.com/MacPaw)
### Unity
- [OpenAi-Api-Unity](https://github.com/hexthedev/OpenAi-Api-Unity) by [hexthedev](https://github.com/hexthedev)
- [com.openai.unity](https://github.com/RageAgainstThePixel/com.openai.unity) by [RageAgainstThePixel](https://github.com/RageAgainstThePixel)
### Unreal Engine
- [OpenAI-Api-Unreal](https://github.com/KellanM/OpenAI-Api-Unreal) by [KellanM](https://github.com/KellanM)

@ -0,0 +1,261 @@
# Creating an automated meeting minutes generator with Whisper and GPT-4
In this tutorial, we'll harness the power of OpenAI's Whisper and GPT-4 models to develop an automated meeting minutes generator. The application transcribes audio from a meeting, provides a summary of the discussion, extracts key points and action items, and performs a sentiment analysis.
## Getting started
This tutorial assumes a basic understanding of Python and an [OpenAI API key](/account/api-keys). You can use the audio file provided with this tutorial or your own.
Additionally, you will need to install the [python-docx](https://python-docx.readthedocs.io/en/latest/) and [OpenAI](/docs/libraries/libraries) libraries. You can create a new Python environment and install the required packages with the following commands:
```bash
python -m venv env
source env/bin/activate
pip install openai
pip install python-docx
```
## Transcribing audio with Whisper
<Image
png="https://cdn.openai.com/API/docs/images/tutorials/meeting-minutes/waveform3.png"
webp="https://cdn.openai.com/API/docs/images/tutorials/meeting-minutes/waveform3.webp"
alt="Audio Waveform created by DALL·E"
width="1024"
height="1024"
/>
The first step in transcribing the audio from a meeting is to pass the audio file of the meeting into our `/v1/audio` API. Whisper, the model that powers the audio API, is capable of converting spoken language into written text. To start, we will avoid passing a prompt or temperature (optional parameters to control the model's output) and stick with the default values.
<Button
size="medium"
color={ButtonColor.neutral}
href="https://cdn.openai.com/API/docs/images/tutorials/meeting-minutes/EarningsCall.wav"
target="_blank"
>
Download sample audio
Next, we import the required packages and define a function that uses the Whisper model to take in the audio file and
transcribe it:
```python
from openai import OpenAI
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
# api_key="My API Key",
)
from docx import Document
def transcribe_audio(audio_file_path):
with open(audio_file_path, 'rb') as audio_file:
        transcription = client.audio.transcriptions.create(model="whisper-1", file=audio_file)
    return transcription.text
```
In this function, `audio_file_path` is the path to the audio file you want to transcribe. The function opens this file and passes it to the Whisper ASR model (`whisper-1`) for transcription. The result is returned as raw text. It's important to note that the transcription call requires the actual audio file to be passed in, not just the path to the file locally or on a remote server. This means that if you are running this code on a server where you might not also be storing your audio files, you will need to have a preprocessing step that first downloads the audio files onto that device.
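For example, a minimal preprocessing sketch might look like the following, reusing the `transcribe_audio` function defined above (the URL and file names are placeholders):
```python
import urllib.request

def download_audio(url, local_path):
    # Fetch the remote audio file so it can be opened locally and passed to Whisper.
    urllib.request.urlretrieve(url, local_path)
    return local_path

audio_file_path = download_audio(
    "https://example.com/meetings/earnings_call.wav",  # placeholder URL
    "EarningsCall.wav",
)
transcription = transcribe_audio(audio_file_path)
```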
## Summarizing and analyzing the transcript with GPT-4
Having obtained the transcript, we now pass it to GPT-4 via the [Chat Completions API](/docs/api-reference/chat/create). GPT-4 is OpenAI's state-of-the-art large language model which we'll use to generate a summary, extract key points, action items, and perform sentiment analysis.
This tutorial uses distinct functions for each task we want GPT-4 to perform. This is not the most efficient approach (you could put these instructions into one function); however, splitting them up can lead to higher quality summarization.
To split the tasks up, we define the `meeting_minutes` function which will serve as the main function of this application:
```python
def meeting_minutes(transcription):
abstract_summary = abstract_summary_extraction(transcription)
key_points = key_points_extraction(transcription)
action_items = action_item_extraction(transcription)
sentiment = sentiment_analysis(transcription)
return {
'abstract_summary': abstract_summary,
'key_points': key_points,
'action_items': action_items,
'sentiment': sentiment
}
```
In this function, `transcription` is the text we obtained from Whisper. The transcription can be passed to the four other functions, each designed to perform a specific task: `abstract_summary_extraction` generates a summary of the meeting, `key_points_extraction` extracts the main points, `action_item_extraction` identifies the action items, and `sentiment_analysis` performs a sentiment analysis. If there are other capabilities you want, you can add those in as well using the same framework shown above.
Here is how each of these functions works:
### Summary extraction
The `abstract_summary_extraction` function takes the transcription and summarizes it into a concise abstract paragraph with the aim to retain the most important points while avoiding unnecessary details or tangential points. The main mechanism to enable this process is the system message as shown below. There are many different possible ways of achieving similar results through the process commonly referred to as prompt engineering. You can read our [prompt engineering guide](/docs/guides/prompt-engineering) which gives in depth advice on how to do this most effectively.
```python
def abstract_summary_extraction(transcription):
response = client.chat.completions.create(
model="gpt-4",
temperature=0,
messages=[
{
"role": "system",
"content": "You are a highly skilled AI trained in language comprehension and summarization. I would like you to read the following text and summarize it into a concise abstract paragraph. Aim to retain the most important points, providing a coherent and readable summary that could help a person understand the main points of the discussion without needing to read the entire text. Please avoid unnecessary details or tangential points."
},
{
"role": "user",
"content": transcription
}
]
)
    return response.choices[0].message.content
```
### Key points extraction
The `key_points_extraction` function identifies and lists the main points discussed in the meeting. These points should represent the most important ideas, findings, or topics crucial to the essence of the discussion. Again, the main mechanism for controlling the way these points are identified is the system message. You might want to give some additional context here around the way your project or company runs, such as “We are a company that sells race cars to consumers. We do XYZ with the goal of XYZ”. This additional context could dramatically improve the model's ability to extract information that is relevant.
```python
def key_points_extraction(transcription):
response = client.chat.completions.create(
model="gpt-4",
temperature=0,
messages=[
{
"role": "system",
"content": "You are a proficient AI with a specialty in distilling information into key points. Based on the following text, identify and list the main points that were discussed or brought up. These should be the most important ideas, findings, or topics that are crucial to the essence of the discussion. Your goal is to provide a list that someone could read to quickly understand what was talked about."
},
{
"role": "user",
"content": transcription
}
]
)
    return response.choices[0].message.content
```
### Action item extraction
The `action_item_extraction` function identifies tasks, assignments, or actions agreed upon or mentioned during the meeting. These could be tasks assigned to specific individuals or general actions the group decided to take. While not covered in this tutorial, the Chat Completions API provides a [function calling capability](/docs/guides/function-calling) which would allow you to build in the ability to automatically create tasks in your task management software and assign it to the relevant person.
```python
def action_item_extraction(transcription):
response = client.chat.completions.create(
model="gpt-4",
temperature=0,
messages=[
{
"role": "system",
"content": "You are an AI expert in analyzing conversations and extracting action items. Please review the text and identify any tasks, assignments, or actions that were agreed upon or mentioned as needing to be done. These could be tasks assigned to specific individuals, or general actions that the group has decided to take. Please list these action items clearly and concisely."
},
{
"role": "user",
"content": transcription
}
]
)
    return response.choices[0].message.content
```
### Sentiment analysis
The `sentiment_analysis` function analyzes the overall sentiment of the discussion. It considers the tone, the emotions conveyed by the language used, and the context in which words and phrases are used. For tasks which are less complicated, it may also be worthwhile to try out `gpt-3.5-turbo` in addition to `gpt-4` to see if you can get a similar level of performance. It might also be useful to experiment with taking the results of the `sentiment_analysis` function and passing it to the other functions to see how having the sentiment of the conversation impacts the other attributes.
```python
def sentiment_analysis(transcription):
response = client.chat.completions.create(
model="gpt-4",
temperature=0,
messages=[
{
"role": "system",
"content": "As an AI with expertise in language and emotion analysis, your task is to analyze the sentiment of the following text. Please consider the overall tone of the discussion, the emotion conveyed by the language used, and the context in which words and phrases are used. Indicate whether the sentiment is generally positive, negative, or neutral, and provide brief explanations for your analysis where possible."
},
{
"role": "user",
"content": transcription
}
]
)
    return response.choices[0].message.content
```
## Exporting meeting minutes
<Image
png="https://cdn.openai.com/API/docs/images/tutorials/meeting-minutes/waveform4.png"
webp="https://cdn.openai.com/API/docs/images/tutorials/meeting-minutes/waveform4.webp"
alt="Audio Waveform created by DALL·E"
width="1024"
height="1024"
/>
Once we've generated the meeting minutes, it's beneficial to save them into a
readable format that can be easily distributed. One common format for such
reports is Microsoft Word. The Python docx library is a popular open source
library for creating Word documents. If you wanted to build an end-to-end
meeting minute application, you might consider removing this export step in
favor of sending the summary inline as an email followup.
To handle the exporting process, define a function `save_as_docx` that converts the raw text to a Word document:
```python
def save_as_docx(minutes, filename):
doc = Document()
for key, value in minutes.items():
# Replace underscores with spaces and capitalize each word for the heading
heading = ' '.join(word.capitalize() for word in key.split('_'))
doc.add_heading(heading, level=1)
doc.add_paragraph(value)
# Add a line break between sections
doc.add_paragraph()
doc.save(filename)
```
In this function, `minutes` is a dictionary containing the abstract summary, key points, action items, and sentiment analysis from the meeting, and `filename` is the name of the Word document file to be created. The function creates a new Word document, adds headings and content for each part of the minutes, and then saves the document to the current working directory.
Finally, you can put it all together and generate the meeting minutes from an audio file:
```python
audio_file_path = "Earningscall.wav"
transcription = transcribe_audio(audio_file_path)
minutes = meeting_minutes(transcription)
print(minutes)
save_as_docx(minutes, 'meeting_minutes.docx')
```
This code will transcribe the audio file `Earningscall.wav`, generate the meeting minutes, print them, and then save them into a Word document called `meeting_minutes.docx`.
Now that you have the basic meeting minutes processing setup, consider trying to optimize the performance with [prompt engineering](/docs/guides/prompt-engineering) or build an end-to-end system with native [function calling](/docs/guides/function-calling).

@ -0,0 +1,332 @@
# Migration Guide (Beta)
We have changed the way that tools and files work in the Assistants API between the `v1` and `v2` versions of the beta. Both versions of the beta continue to be accessible via the API today, but we recommend migrating to the newest version of our APIs as soon as feasible. We will deprecate `v1` of the beta by the end of 2024.
If you do not use tools or files with the Assistants API today, there should be no changes required for you to migrate from the `v1` version to the `v2` version of the beta. Simply pass the `v2` beta version header and/or move to the latest version of our Node and Python SDKs!
## What has changed
The `v2` version of the Assistants API contains the following changes:
1. **Tool rename:** The `retrieval` tool has been renamed to the `file_search` tool
2. **Files belong to tools:** Files are now associated with tools instead of Assistants and Messages. This means that:
- `AssistantFile` and `MessageFile` objects no longer exist.
- Instead of `AssistantFile` and `MessageFile`, files are attached to Assistants and **Threads** using the new `tool_resources` object.
- The `tool_resources` for the code interpreter tool are a list of `file_ids`.
- The `tool_resources` for the `file_search` tool is a new object called a `vector_store`.
- Messages now have an `attachments` parameter, rather than a `file_ids` parameter. Message attachments are helpers that add files to a Thread's `tool_resources`.
<CodeComparison
snippets={[
{
language: "json",
code: `
{
"id": "asst_abc123",
"object": "assistant",
"created_at": 1698984975,
"name": "Math Tutor",
"description": null,
"model": "gpt-4-turbo",
"instructions": "You are a personal math tutor. When asked a question, write and run Python code to answer the question.",
"tools": [{ "type": "code_interpreter" }],
"file_ids": [],
"metadata": {}
}
`,
title: "V1 Assistant",
},
{
language: "json",
code: `
{
"id": "asst_abc123",
"object": "assistant",
"created_at": 1698984975,
"name": "Math Tutor",
"description": null,
"model": "gpt-4-turbo",
"instructions": "You are a personal math tutor. When asked a question, write and run Python code to answer the question.",
"tools": [
{
"type": "code_interpreter"
},
{
"type": "file_search"
}
],
"tool_resources": {
"file_search": {
"vector_store_ids": ["vs_abc"]
},
"code_interpreter": {
"file_ids": ["file-123", "file-456"]
}
}
}
`,
title: "V2 Assistant",
},
]}
/>
Assistants have `tools` and `tool_resources` instead of `file_ids`. The `retrieval` tool is now the `file_search` tool. The `tool_resource` for the `file_search` tool is a `vector_store`.
<CodeComparison
snippets={[
{
language: "json",
code: `
{
"id": "thread_abc123",
"object": "thread",
"created_at": 1699012949,
"metadata": {}
}
`,
title: "V1 Thread",
},
{
language: "json",
code: `
{
"id": "thread_abc123",
"object": "thread",
"created_at": 1699012949,
"metadata": {},
"tools": [
{
"type": "file_search"
},
{
"type": "code_interpreter"
}
],
"tool_resources": {
"file_search": {
"vector_store_ids": ["vs_abc"]
},
"code_interpreter": {
"file_ids": ["file-123", "file-456"]
}
}
}
`,
title: "V2 Thread",
},
]}
/>
Threads can bring their own `tool_resources` into a conversation.
<CodeComparison
snippets={[
{
title: "V1 Message",
language: "json",
code: `
{
"id": "msg_abc123",
"object": "thread.message",
"created_at": 1698983503,
"thread_id": "thread_abc123",
"role": "assistant",
"content": [
{
"type": "text",
"text": {
"value": "Hi! How can I help you today?",
"annotations": []
}
}
],
"assistant_id": "asst_abc123",
"run_id": "run_abc123",
"metadata": {},
"file_ids": []
}
`,
},
{
title: "V2 Message",
language: "json",
code: `
{
"id": "msg_abc123",
"object": "thread.message",
"created_at": 1698983503,
"thread_id": "thread_abc123",
"role": "assistant",
"content": [
{
"type": "text",
"text": {
"value": "Hi! How can I help you today?",
"annotations": []
}
}
],
"assistant_id": "asst_abc123",
"run_id": "run_abc123",
"metadata": {},
"attachments": [
{
"file_id": "file-123",
"tools": [
{ "type": "file_search" },
{ "type": "code_interpreter" }
]
}
]
}
`,
},
]}
/>
Messages have `attachments` instead of `file_ids`. `attachments` are helpers that add files to the Thread's `tool_resources`.
All `v1` endpoints and objects for the Assistants API can be found under the [Legacy](/docs/api-reference/assistants-v1) section of the API reference.
## Accessing v1 data in v2
To make your migration simple between our `v1` and `v2` APIs, we automatically map `AssistantFiles` and `MessageFiles` to the appropriate `tool_resources` based on the tools that are enabled in Assistants or Runs these files are a part of.
| | `v1` version | `v2` version |
| :------------------------------------ | :---------------------- | :------------------------------------------------------------------------------ |
| AssistantFiles for `code_interpreter` | `file_ids` on Assistant | Files in an Assistant's `tool_resources.code_interpreter` |
| AssistantFiles for `retrieval` | `file_ids` on Assistant | Files in a vector_store attached to an Assistant (`tool_resources.file_search`) |
| MessageFiles for `code_interpreter` | `file_ids` on Message | Files in a Thread's `tool_resources.code_interpreter` |
| MessageFiles for `retrieval` | `file_ids` on Message | Files in a vector_store attached to a Thread (`tool_resources.file_search`) |
It's important to note that while `file_ids` from `v1` are mapped to `tool_resources` in `v2`, the inverse is not true. Changes you make to `tool_resources` in `v2` will not be reflected as `file_ids` in `v1`.
Because Assistant Files and Message Files are already mapped to the appropriate `tool_resources` in `v2`, when you're ready to migrate to `v2` you shouldn't have to worry about a data migration. Instead, you only need to:
1. Update your integration to reflect the new API and objects. You may need to do things like:
- Migrate to creating `vector_stores` and using `file_search`, if you were using the `retrieval` tool. Importantly, since these operations are asynchronous, you'll want to ensure files are [successfully ingested](/docs/assistants/tools/file-search/ensure-readiness-before-creating-runs) by the `vector_stores` before creating a run; see the sketch after this list.
- Migrate to adding files to `tool_resources.code_interpreter` instead of an Assistant or Messages files, if you were using the `code_interpreter` tool.
- Migrate to using Message `attachments` instead of `file_ids`.
2. Upgrade to the latest version of our SDKs
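For illustration, moving from `retrieval` to `file_search` might look roughly like this with a post-`v2` Python SDK (the file names, assistant configuration, and file ID are placeholders):
```python
from openai import OpenAI

client = OpenAI()

# 1. Create a vector store and wait for the files to finish ingesting.
vector_store = client.beta.vector_stores.create(name="Product FAQs")
client.beta.vector_stores.file_batches.upload_and_poll(
    vector_store_id=vector_store.id,
    files=[open("faq.pdf", "rb")],
)

# 2. Attach it to an Assistant via tool_resources instead of file_ids.
assistant = client.beta.assistants.create(
    model="gpt-4-turbo",
    tools=[{"type": "file_search"}],
    tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)

# 3. Attach per-message files via attachments instead of file_ids.
thread = client.beta.threads.create()
client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content="What does the warranty cover?",
    attachments=[{"file_id": "file-123", "tools": [{"type": "file_search"}]}],  # placeholder file ID
)
```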
## Changing beta versions
#### Without SDKs
Both beta versions can be accessed by passing the right API version header in your API requests:
1. `v1`: `OpenAI-Beta: assistants=v1`
2. `v2`: `OpenAI-Beta: assistants=v2`
<CodeSample
defaultLanguage="v2"
code={{
v1: `
curl "https://api.openai.com/v1/assistants" \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "OpenAI-Beta: assistants=v1" \\
-d '{
"instructions": "You are a personal math tutor. When asked a question, write and run Python code to answer the question.",
"name": "Math Tutor",
"tools": [{"type": "code_interpreter"}],
"model": "gpt-4-turbo"
}'
`.trim(),
v2: `
curl "https://api.openai.com/v1/assistants" \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "OpenAI-Beta: assistants=v2" \\
-d '{
"instructions": "You are a personal math tutor. When asked a question, write and run Python code to answer the question.",
"name": "Math Tutor",
"tools": [{"type": "code_interpreter"}],
"model": "gpt-4-turbo"
}'
`.trim(),
}}
/>
### With SDKs
Versions of our SDKs that are released after the release of the **`v2`** beta will have the **`openai.beta`** namespace
point to the **`v2`** version of the API by default. You can still access the **`v1`** version of the API by using an older
version of the SDK (1.20.0 or earlier for Python, 4.36.0 or earlier for Node.js) or by overriding the version header.
To install an older version of the SDK, you can use the following commands:
<CodeSample
title="Installing older versions of the SDK"
defaultLanguage="python"
code={{
python: `
pip install openai==1.20.0
`.trim(),
"node.js": `
npm install openai@4.36.0
`.trim(),
}}
/>
You can also override this header in a newer SDK version, but we don't recommend this approach since the object types in these newer SDK versions will be different from the `v1` objects.
<CodeSample
title="Accessing the `v1` API version in new SDKs"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI\n
client = OpenAI(default_headers={"OpenAI-Beta": "assistants=v1"})
`.trim(),
"node.js": `
import OpenAI from "openai";\n
const openai = new OpenAI({ defaultHeaders: {"OpenAI-Beta": "assistants=v1"} });
`.trim(),
}}
/>
## Billing
All [vector stores](/docs/api-reference/vector-stores/object) created before the release of the `v2` API (April 17, 2024) will be free to use until the end of 2024. This means that any vector stores created as a result of us mapping your `v1` data to `v2` before the `v2` launch will also be free. After the end of 2024, they'll be billed at whatever the fees for vector stores are at that point. See our [pricing page](https://openai.com/pricing) for the latest pricing information.
Any vector store that is created before the release of the `v2` API (April 17, 2024) but not used in a single Run between that release date and the end of 2024 will be deleted. This is to avoid us starting to bill you for something you created during the beta but never used.
Vector stores created after the release of the `v2` API will be billed at current rates as specified on the [pricing page](https://openai.com/pricing).
## Deleting files
Deleting Assistant Files / Message Files via the `v1` API also removes them from the `v2` API. However, the inverse is not true - deletions in the `v2` version of the API do not propagate to `v1`. If you created a file on `v1` and would like to "fully" delete a file from your account on both `v1` and `v2`, you should:
- delete Assistant Files / Message Files you create using `v1` APIs using the `v1` endpoints, or
- delete the underlying [file object](/docs/api-reference/files/delete) — this ensures it is fully removed from all objects in all versions of the API (a minimal sketch of this call follows).
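For the second option, a minimal sketch with the Python SDK (the file ID is a placeholder):
```python
from openai import OpenAI

client = OpenAI()

# Deleting the underlying file object removes it from every Assistant, Thread,
# and vector store that references it, across both API versions.
client.files.delete("file-abc123")  # placeholder file ID
```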
## Playground
The default playground experience has been migrated to use the `v2` version of the API (you will still have a read-only view of the `v1` version of objects, but will not be able to edit them). Any changes you make to tools and files via the Playground will only be accessible in the `v2` version of the API.
In order to make changes to files in the `v1` version of the API, you will need to use the API directly.

@ -0,0 +1,228 @@
# Models
## Flagship models
## Models overview
The OpenAI API is powered by a diverse set of models with different capabilities and price points. You can also make customizations to our models for your specific use case with [fine-tuning](/docs/guides/fine-tuning).
| Model | Description |
| ----------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- |
| [GPT-4o](/docs/models/gpt-4o) | The fastest and most affordable flagship model |
| [GPT-4 Turbo and GPT-4](/docs/models/gpt-4-turbo-and-gpt-4) | The previous set of high-intelligence models |
| [GPT-3.5 Turbo](/docs/models/gpt-3-5-turbo) | A fast, inexpensive model for simple tasks |
| [DALL·E](/docs/models/dall-e) | A model that can generate and edit images given a natural language prompt |
| [TTS](/docs/models/tts) | A set of models that can convert text into natural sounding spoken audio |
| [Whisper](/docs/models/whisper) | A model that can convert audio into text |
| [Embeddings](/docs/models/embeddings) | A set of models that can convert text into a numerical form |
| [Moderation](/docs/models/moderation) | A fine-tuned model that can detect whether text may be sensitive or unsafe |
| [GPT base](/docs/models/gpt-base) | A set of models without instruction following that can understand as well as generate natural language or code |
| [Deprecated](/docs/deprecations) | A full list of models that have been deprecated along with the suggested replacement |
We have also published open source models including [Point-E](https://github.com/openai/point-e), [Whisper](https://github.com/openai/whisper), [Jukebox](https://github.com/openai/jukebox), and [CLIP](https://github.com/openai/CLIP).
## Continuous model upgrades
`gpt-4o`, `gpt-4-turbo`, `gpt-4`, and `gpt-3.5-turbo` point to their respective latest model version. You can verify this by looking at the [response object](/docs/api-reference/chat/object) after sending a request. The response will include the specific model version used (e.g. `gpt-3.5-turbo-1106`).
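For example, a minimal sketch of checking which snapshot an alias resolved to with the Python SDK:
```python
from openai import OpenAI

client = OpenAI()

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Hello!"}],
)

# The response reports the specific snapshot that served the request,
# e.g. "gpt-4o-2024-05-13" rather than the "gpt-4o" alias.
print(response.model)
```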
We also offer pinned model versions that developers can continue using for at least three months after an updated model has been introduced. With the new cadence of model updates, we are also giving people the ability to contribute evals to help us improve the model for different use cases. If you are interested, check out the [OpenAI Evals](https://github.com/openai/evals) repository.
Learn more about model deprecation on our [deprecation page](/docs/deprecations).
## GPT-4o
GPT-4o (“o” for “omni”) is our most advanced model. It is multimodal (accepting text or image inputs and outputting text), and it has the same high intelligence as GPT-4 Turbo but is much more efficient—it generates text 2x faster and is 50% cheaper. Additionally, GPT-4o has the best vision and performance across non-English languages of any of our models. GPT-4o is available in the OpenAI API to paying customers. Learn how to use GPT-4o in our [text generation guide](/docs/guides/text-generation).
| Model | Description | Context window | Training data |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -------------- | -------------- |
| gpt-4o            | New **GPT-4o**: Our most advanced, multimodal flagship model that's cheaper and faster than GPT-4 Turbo. Currently points to `gpt-4o-2024-05-13`. | 128,000 tokens | Up to Oct 2023 |
| gpt-4o-2024-05-13 | `gpt-4o` currently points to this version. | 128,000 tokens | Up to Oct 2023 |
## GPT-4 Turbo and GPT-4
GPT-4 is a large multimodal model (accepting text or image inputs and outputting text) that can solve difficult problems with greater accuracy than any of our previous models, thanks to its broader general knowledge and advanced reasoning capabilities. GPT-4 is available in the OpenAI API to [paying customers](https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4). Like `gpt-3.5-turbo`, GPT-4 is optimized for chat but works well for traditional completions tasks using the [Chat Completions API](/docs/api-reference/chat). Learn how to use GPT-4 in our [text generation guide](/docs/guides/text-generation).
| Model | Description | Context window | Training data |
| ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | -------------- |
| gpt-4-turbo | The latest GPT-4 Turbo model with vision capabilities. Vision requests can now use JSON mode and function calling. Currently points to `gpt-4-turbo-2024-04-09`. | 128,000 tokens | Up to Dec 2023 |
| gpt-4-turbo-2024-04-09 | GPT-4 Turbo with Vision model. Vision requests can now use JSON mode and function calling. `gpt-4-turbo` currently points to this version. | 128,000 tokens | Up to Dec 2023 |
| gpt-4-turbo-preview | GPT-4 Turbo preview model. Currently points to `gpt-4-0125-preview`. | 128,000 tokens | Up to Dec 2023 |
| gpt-4-0125-preview     | GPT-4 Turbo preview model intended to reduce cases of “laziness” where the model doesn't complete a task. Returns a maximum of 4,096 output tokens. [Learn more](https://openai.com/blog/new-embedding-models-and-api-updates). | 128,000 tokens | Up to Dec 2023 |
| gpt-4-1106-preview | GPT-4 Turbo preview model featuring improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens. This is a preview model. [Learn more](https://openai.com/blog/new-models-and-developer-products-announced-at-devday). | 128,000 tokens | Up to Apr 2023 |
| gpt-4 | Currently points to `gpt-4-0613`. See [continuous model upgrades](/docs/models/continuous-model-upgrades). | 8,192 tokens | Up to Sep 2021 |
| gpt-4-0613 | Snapshot of `gpt-4` from June 13th 2023 with improved function calling support. | 8,192 tokens | Up to Sep 2021 |
| gpt-4-0314 | Legacy Snapshot of `gpt-4` from March 14th 2023. | 8,192 tokens | Up to Sep 2021 |
For many basic tasks, the difference between GPT-4 and GPT-3.5 models is not significant. However, in more complex reasoning situations, GPT-4 is much more capable than any of our previous models.
#### Multilingual capabilities
GPT-4 [outperforms both previous large language models](https://cdn.openai.com/papers/gpt-4.pdf) and as of 2023, most state-of-the-art systems (which often have benchmark-specific training or hand-engineering). On the MMLU benchmark, an English-language suite of multiple-choice questions covering 57 subjects, GPT-4 not only outperforms existing models by a considerable margin in English, but also demonstrates strong performance in other languages.
## GPT-3.5 Turbo
GPT-3.5 Turbo models can understand and generate natural language or code and have been optimized for chat using the [Chat Completions API](/docs/api-reference/chat) but work well for non-chat tasks as well.
| Model | Description | Context window | Training data |
| ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------- | -------------- |
| gpt-3.5-turbo-0125 | The latest GPT-3.5 Turbo model with higher accuracy at responding in requested formats and a fix for a bug which caused a text encoding issue for non-English language function calls. Returns a maximum of 4,096 output tokens. [Learn more](https://openai.com/blog/new-embedding-models-and-api-updates#:~:text=Other%20new%20models%20and%20lower%20pricing). | 16,385 tokens | Up to Sep 2021 |
| gpt-3.5-turbo | Currently points to `gpt-3.5-turbo-0125`. | 16,385 tokens | Up to Sep 2021 |
| gpt-3.5-turbo-1106 | GPT-3.5 Turbo model with improved instruction following, JSON mode, reproducible outputs, parallel function calling, and more. Returns a maximum of 4,096 output tokens. [Learn more](https://openai.com/blog/new-models-and-developer-products-announced-at-devday). | 16,385 tokens | Up to Sep 2021 |
| gpt-3.5-turbo-instruct | Similar capabilities as GPT-3 era models. Compatible with legacy Completions endpoint and not Chat Completions. | 4,096 tokens | Up to Sep 2021 |
## DALL·E
DALL·E is an AI system that can create realistic images and art from a description in natural language. DALL·E 3 currently supports the ability, given a prompt, to create a new image with a specific size. DALL·E 2 also supports the ability to edit an existing image, or create variations of a user-provided image.
[DALL·E 3](https://openai.com/dall-e-3) is available through our [Images API](/docs/guides/images/introduction) along with [DALL·E 2](https://openai.com/blog/dall-e-api-now-available-in-public-beta). You can try DALL·E 3 through [ChatGPT Plus](https://chatgpt.com).
| Model | Description |
| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `dall-e-3` | The latest DALL·E model released in Nov 2023. [Learn more](https://openai.com/blog/new-models-and-developer-products-announced-at-devday). |
| `dall-e-2` | The previous DALL·E model released in Nov 2022. The 2nd iteration of DALL·E with more realistic, accurate, and 4x greater resolution images than the original model. |
## TTS
TTS is an AI model that converts text to natural-sounding spoken audio. We offer two different model variants: `tts-1` is optimized for real-time text-to-speech use cases and `tts-1-hd` is optimized for quality. These models can be used with the [Speech endpoint in the Audio API](/docs/guides/text-to-speech).
| Model | Description |
| ---------- | ------------------------------------------------------- |
| `tts-1` | The latest text to speech model, optimized for speed. |
| `tts-1-hd` | The latest text to speech model, optimized for quality. |
## Whisper
Whisper is a general-purpose speech recognition model. It is trained on a large dataset of diverse audio and is also a multi-task model that can perform multilingual speech recognition as well as speech translation and language identification. The Whisper v2-large model is currently available through our API with the `whisper-1` model name.
Currently, there is no difference between the [open source version of Whisper](https://github.com/openai/whisper) and the version available through our API. However, [through our API](/docs/guides/speech-to-text), we offer an optimized inference process which makes running Whisper through our API much faster than doing it through other means. For more technical details on Whisper, you can [read the paper](https://arxiv.org/abs/2212.04356).
## Embeddings
Embeddings are a numerical representation of text that can be used to measure the relatedness between two pieces of text. Embeddings are useful for search, clustering, recommendations, anomaly detection, and classification tasks. You can read more about our latest embedding models in the [announcement blog post](https://openai.com/blog/new-embedding-models-and-api-updates).
| Model | Description | Output Dimension |
| ------------------------ | --------------------------------------------------------------------------------- | ---------------- |
| `text-embedding-3-large` | Most capable embedding model for both English and non-English tasks | 3,072 |
| `text-embedding-3-small` | Increased performance over 2nd generation ada embedding model | 1,536 |
| `text-embedding-ada-002` | Most capable 2nd generation embedding model, replacing 16 first generation models | 1,536 |
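As a minimal sketch of how relatedness can be measured, the example below embeds two sentences with `text-embedding-3-small` and compares them with cosine similarity (the helper function is illustrative, not part of the SDK):
```python
import math

from openai import OpenAI

client = OpenAI()


def cosine_similarity(a: list[float], b: list[float]) -> float:
    # Illustrative helper: cosine similarity between two embedding vectors.
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))


response = client.embeddings.create(
    model="text-embedding-3-small",
    input=["The cat sat on the mat.", "A feline rested on the rug."],
)

vector_a, vector_b = (item.embedding for item in response.data)
print(cosine_similarity(vector_a, vector_b))  # closer to 1.0 means more related
```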
---
## Moderation
The Moderation models are designed to check whether content complies with OpenAI's [usage policies](https://openai.com/policies/usage-policies). The models provide classification capabilities that look for content in the following categories: hate, hate/threatening, self-harm, sexual, sexual/minors, violence, and violence/graphic. You can find out more in our [moderation guide](/docs/guides/moderation/overview).
Moderation models take in an arbitrarily sized input that is automatically broken up into chunks of 4,096 tokens. In cases where the input is longer than 32,768 tokens, it is truncated, which in rare cases may omit a small number of tokens from the moderation check.
The final result for each request to the moderation endpoint shows the maximum value on a per-category basis. For example, if one chunk of 4K tokens had a category score of 0.9901 and the other had a score of 0.1901, the API response would show 0.9901 for that category, since it is higher.
| Model | Description | Max tokens |
| ------------------------ | ---------------------------------------------------- | ---------- |
| `text-moderation-latest` | Currently points to `text-moderation-007`. | 32,768 |
| `text-moderation-stable` | Currently points to `text-moderation-007`. | 32,768 |
| `text-moderation-007` | Most capable moderation model across all categories. | 32,768 |
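As a small illustration of the per-category aggregation described above (the chunk scores are placeholder numbers):
```python
# Scores reported for the same category across two 4,096-token chunks of one input.
chunk_scores = {"violence": [0.9901, 0.1901]}

# The endpoint reports the maximum score per category across all chunks.
final_scores = {category: max(scores) for category, scores in chunk_scores.items()}
print(final_scores)  # {'violence': 0.9901}
```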
## GPT base
GPT base models can understand and generate natural language or code but are not trained with instruction following. These models are made to be replacements for our original GPT-3 base models and use the legacy Completions API. Most customers should use GPT-3.5 or GPT-4.
| Model | Description | Max tokens | Training data |
| ------------- | ------------------------------------------------------------ | ------------- | -------------- |
| `babbage-002` | Replacement for the GPT-3 `ada` and `babbage` base models. | 16,384 tokens | Up to Sep 2021 |
| `davinci-002` | Replacement for the GPT-3 `curie` and `davinci` base models. | 16,384 tokens | Up to Sep 2021 |
## How we use your data
Your data is your data.
As of March 1, 2023, data sent to the OpenAI API will not be used to train or improve OpenAI models (unless you explicitly [opt in](https://docs.google.com/forms/d/e/1FAIpQLSevgtKyiSWIOj6CV6XWBHl1daPZSOcIWzcUYUXQ1xttjBgDpA/viewform)). One advantage to opting in is that the models may get better at your use case over time.
To help identify abuse, API data may be retained for up to 30 days, after which it will be deleted (unless otherwise required by law). For trusted customers with sensitive applications, zero data retention may be available. With zero data retention, request and response bodies are not persisted to any logging mechanism and exist only in memory in order to serve the request.
Note that this data policy does not apply to OpenAI's non-API consumer services like [ChatGPT](https://chatgpt.com/) or [DALL·E Labs](https://labs.openai.com/).
### Default usage policies by endpoint
| Endpoint | Data used for training | Default retention | Eligible for zero retention |
| -------------------------- | ---------------------- | ------------------------- | --------------------------- |
| `/v1/chat/completions`\* | No | 30 days | Yes, except image inputs\* |
| `/v1/assistants` | No | 30 days \*\* | No |
| `/v1/threads` | No | 30 days \*\* | No |
| `/v1/threads/messages` | No | 30 days \*\* | No |
| `/v1/threads/runs` | No | 30 days \*\* | No |
| `/v1/vector_stores` | No | 30 days \*\* | No |
| `/v1/threads/runs/steps` | No | 30 days \*\* | No |
| `/v1/images/generations` | No | 30 days | No |
| `/v1/images/edits` | No | 30 days | No |
| `/v1/images/variations` | No | 30 days | No |
| `/v1/embeddings` | No | 30 days | Yes |
| `/v1/audio/transcriptions` | No | Zero data retention | - |
| `/v1/audio/translations` | No | Zero data retention | - |
| `/v1/audio/speech` | No | 30 days | Yes |
| `/v1/files` | No | Until deleted by customer | No |
| `/v1/fine_tuning/jobs` | No | Until deleted by customer | No |
| `/v1/batches` | No | Until deleted by customer | No |
| `/v1/moderations` | No | Zero data retention | - |
| `/v1/completions` | No | 30 days | Yes |
\* Image inputs via the `gpt-4-turbo` model (or previously `gpt-4-vision-preview`) are not eligible for zero retention.
\*\* Objects related to the Assistants API are deleted from our servers 30 days after you delete them via the API or the dashboard. Objects that are not deleted via the API or dashboard are retained indefinitely.
For details, see our [API data usage policies](https://openai.com/policies/api-data-usage-policies). To learn more about zero retention, get in touch with our [sales team](https://openai.com/contact-sales).
## Model endpoint compatibility
| Endpoint | Latest models |
| ------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| /v1/assistants | All GPT-4 and GPT-3.5 Turbo models. The `retrieval` tool requires `gpt-4-turbo-preview` (and subsequent dated model releases) or `gpt-3.5-turbo-1106` (and subsequent versions). |
| /v1/audio/transcriptions | `whisper-1` |
| /v1/audio/translations | `whisper-1` |
| /v1/audio/speech | `tts-1`, `tts-1-hd` |
| /v1/chat/completions | `gpt-4` and dated model releases, `gpt-4-turbo-preview` and dated model releases, `gpt-3.5-turbo` and dated model releases, fine-tuned versions of `gpt-3.5-turbo` |
| /v1/completions (Legacy) | `gpt-3.5-turbo-instruct`, `babbage-002`, `davinci-002` |
| /v1/embeddings | `text-embedding-3-small`, `text-embedding-3-large`, `text-embedding-ada-002` |
| /v1/fine_tuning/jobs | `gpt-3.5-turbo`, `babbage-002`, `davinci-002` |
| /v1/moderations | `text-moderation-stable`, `text-moderation-latest` |
| /v1/images/generations | `dall-e-2`, `dall-e-3` |
This list excludes all of our [deprecated models](/docs/deprecations).

@ -0,0 +1,114 @@
# Moderation
Learn how to build moderation into your AI applications.
## Overview
The [moderations](/docs/api-reference/moderations) endpoint is a tool you can use to check whether text is potentially harmful. Developers can use it to identify content that might be harmful and take action, for instance by filtering it.
The model classifies content into the following categories:
| Category | Description |
| ------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `hate` | Content that expresses, incites, or promotes hate based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste. Hateful content aimed at non-protected groups (e.g., chess players) is harassment. |
| `hate/threatening` | Hateful content that also includes violence or serious harm towards the targeted group based on race, gender, ethnicity, religion, nationality, sexual orientation, disability status, or caste. |
| `harassment` | Content that expresses, incites, or promotes harassing language towards any target. |
| `harassment/threatening` | Harassment content that also includes violence or serious harm towards any target. |
| `self-harm` | Content that promotes, encourages, or depicts acts of self-harm, such as suicide, cutting, and eating disorders. |
| `self-harm/intent` | Content where the speaker expresses that they are engaging or intend to engage in acts of self-harm, such as suicide, cutting, and eating disorders. |
| `self-harm/instructions` | Content that encourages performing acts of self-harm, such as suicide, cutting, and eating disorders, or that gives instructions or advice on how to commit such acts. |
| `sexual` | Content meant to arouse sexual excitement, such as the description of sexual activity, or that promotes sexual services (excluding sex education and wellness). |
| `sexual/minors` | Sexual content that includes an individual who is under 18 years old. |
| `violence` | Content that depicts death, violence, or physical injury. |
| `violence/graphic` | Content that depicts death, violence, or physical injury in graphic detail. |
The moderation endpoint is free to use for most developers. For higher accuracy, try splitting long pieces of text into smaller chunks each less than 2,000 characters.
We are continuously working to improve the accuracy of our classifier. Our support for non-English languages is currently limited.
## Quickstart
To obtain a classification for a piece of text, make a request to the [moderation endpoint](/docs/api-reference/moderations) as demonstrated in the following code snippets:
<CodeSample
title="Example: Getting moderations"
defaultLanguage="curl"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
response = client.moderations.create(input="Sample text goes here.")\n
output = response.results[0]
`.trim(),
curl: `
curl https://api.openai.com/v1/moderations \\
-X POST \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{"input": "Sample text goes here"}'
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const moderation = await openai.moderations.create({ input: "Sample text goes here." });\n
console.log(moderation);
}
main();
`.trim(),
}}
/>
Below is an example output of the endpoint. It returns the following fields:
- `flagged`: Set to `true` if the model classifies the content as potentially harmful, `false` otherwise.
- `categories`: Contains a dictionary of per-category violation flags. For each category, the value is `true` if the model flags the corresponding category as violated, `false` otherwise.
- `category_scores`: Contains a dictionary of per-category raw scores output by the model, denoting the model's confidence that the input violates OpenAI's policy for the category. The value is between 0 and 1, where higher values denote higher confidence. The scores should not be interpreted as probabilities.
```json
{
"id": "modr-XXXXX",
"model": "text-moderation-007",
"results": [
{
"flagged": true,
"categories": {
"sexual": false,
"hate": false,
"harassment": false,
"self-harm": false,
"sexual/minors": false,
"hate/threatening": false,
"violence/graphic": false,
"self-harm/intent": false,
"self-harm/instructions": false,
"harassment/threatening": true,
"violence": true
},
"category_scores": {
"sexual": 1.2282071e-6,
"hate": 0.010696256,
"harassment": 0.29842457,
"self-harm": 1.5236925e-8,
"sexual/minors": 5.7246268e-8,
"hate/threatening": 0.0060676364,
"violence/graphic": 4.435014e-6,
"self-harm/intent": 8.098441e-10,
"self-harm/instructions": 2.8498655e-11,
"harassment/threatening": 0.63055265,
"violence": 0.99011886
}
}
]
}
```
We plan to continuously upgrade the moderation endpoint's underlying model. Therefore,
custom policies that rely on `category_scores` may need recalibration over time.
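A common pattern is to rely on `flagged` for the default policy and on `category_scores` for a custom, threshold-based policy. The sketch below assumes placeholder thresholds that you would calibrate against your own data:
```python
from openai import OpenAI

client = OpenAI()

response = client.moderations.create(input="Sample text goes here.")
result = response.results[0]

if result.flagged:
    # Default policy: the model itself judged the content potentially harmful.
    print("Blocked by the default policy.")
elif result.category_scores.violence >= 0.8 or result.category_scores.harassment >= 0.5:
    # Custom policy: placeholder thresholds; recalibrate them whenever the
    # underlying moderation model is upgraded.
    print("Blocked by the custom score-based policy.")
else:
    print("Allowed.")
```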

@ -0,0 +1,128 @@
Node.js is a popular JavaScript runtime that is commonly used for web development. OpenAI provides a custom [Node.js / TypeScript library](https://github.com/openai/openai-node) which makes working with the OpenAI API in JavaScript simple and efficient.
## Step 1: Setting up Node
To use the OpenAI Node.js library, you will need to ensure you have Node.js installed.
To download Node.js, head to the [official Node website](https://nodejs.org/en/download) and download the most recent version marked "LTS" (Long Term Support). If you are installing Node.js for the first time, you can follow the [official Node.js usage guide](https://nodejs.org/api/synopsis.html#usage) to get started.
Once you have Node.js installed, the OpenAI Node.js library can be installed. From the terminal / command line, run:
```
npm install --save openai
# or
yarn add openai
```
## Step 2: Set up your API key
The main advantage of making your API key accessible to all projects is that our SDK will automatically detect it and use it without you having to write any code.
1. **Open Terminal**: You can find it in the Applications folder or search for it using Spotlight (Command + Space).
2. **Edit bash profile**: Use the command `nano ~/.bash_profile` or `nano ~/.zshrc` (for newer MacOS versions) to open the profile file in a text editor.
3. **Add Environment Variable**: In the editor, ensure you have set your API key as shown below, replacing `your-api-key-here` with your actual API key:
```
export OPENAI_API_KEY='your-api-key-here'
```
4. **Save and exit**: Press Ctrl+O to write the changes, followed by Ctrl+X to close the editor.
5. **Load your profile**: Use the command `source ~/.bash_profile` or `source ~/.zshrc` to load the updated profile.
6. **Verification**: Verify the setup by typing `echo $OPENAI_API_KEY` in the terminal. It should display your API key.
1. **Open command prompt**: You can find it by searching "cmd" in the start menu.
2. **Set environment variable**: To set the environment variable for your user account, use the command below, replacing `your-api-key-here` with your actual API key:
```
setx OPENAI_API_KEY "your-api-key-here"
```
This command sets the OPENAI_API_KEY environment variable persistently for your user account. Note that `setx` does not affect the command prompt window that is already open, so open a new command prompt before verifying.
3. **System-wide setup**: To make the variable available to all users on the machine, add it through the system properties as follows:
- Right-click on 'This PC' or 'My Computer' and select 'Properties'.
- Click on 'Advanced system settings'.
- Click the 'Environment Variables' button.
- In the 'System variables' section, click 'New...' and enter OPENAI_API_KEY as the variable name and your API key as the variable value.
4. **Verification**: To verify the setup, reopen the command prompt and type the command below. It should display your API key:
`echo %OPENAI_API_KEY%`
## Step 3: Sending your first API request
After you have Node.js configured and set up an API key, the final step is to send a request to the OpenAI API using the Node.js library. To do this, create a file named `openai-test.js` using the terminal or an IDE.
Inside the file, copy and paste one of the examples below:
<CodeSample
defaultLanguage="ChatCompletions"
code={{
ChatCompletions: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const completion = await openai.chat.completions.create({
messages: [{ role: "system", content: "You are a helpful assistant." }],
model: "gpt-3.5-turbo",
});\n
console.log(completion.choices[0]);
}\n
main();
`.trim(),
Embeddings: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const embedding = await openai.embeddings.create({
model: "text-embedding-ada-002",
input: "The quick brown fox jumped over the lazy dog",
});\n
console.log(embedding);
}\n
main();
`.trim(),
Images: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const image = await openai.images.generate({ prompt: "A cute baby sea otter" });\n
console.log(image.data);
}
main();
`.trim(),
}}
/>
To run the code, enter `node openai-test.js` into the terminal / command line.
The [Chat Completions](/docs/api-reference/chat/create) example highlights just one area of strength for our models: creative ability. Explaining recursion (the programming topic) in a well formatted poem is something both the best developers and best poets would struggle with. In this case, `gpt-3.5-turbo` does it effortlessly.

@ -0,0 +1,340 @@
# Optimizing LLMs for accuracy
### How to maximize correctness and consistent behavior when working with LLMs
Optimizing LLMs is hard.
We've worked with many developers across both start-ups and enterprises, and the reason optimization is hard consistently boils down to these reasons:
- Knowing **how to start** optimizing accuracy
- **When to use what** optimization method
- What level of accuracy is **good enough** for production
This paper gives a mental model for how to optimize LLMs for accuracy and behavior. We'll explore methods like prompt engineering, retrieval-augmented generation (RAG) and fine-tuning. We'll also highlight how and when to use each technique, and share a few pitfalls.
As you read through, it's important to mentally relate these principles to what accuracy means for your specific use case. This may seem obvious, but there is a difference between producing a bad copy that a human needs to fix vs. refunding a customer $1000 rather than $100. You should enter any discussion on LLM accuracy with a rough picture of how much a failure by the LLM costs you, and how much a success saves or earns you - this will be revisited at the end, where we cover how much accuracy is “good enough” for production.
## LLM optimization context
Many “how-to” guides on optimization paint it as a simple linear flow - you start with prompt engineering, then you move on to retrieval-augmented generation, then fine-tuning. However, this is often not the case - these are all levers that solve different things, and to optimize in the right direction you need to pull the right lever.
It is useful to frame LLM optimization as more of a matrix:
![Accuracy mental model diagram](https://openaiassets.blob.core.windows.net/$web/API/docs/images/diagram-optimizing-accuracy-01.png)
The typical LLM task will start in the bottom left corner with prompt engineering, where we test, learn, and evaluate to get a baseline. Once we've reviewed those baseline examples and assessed why they are incorrect, we can pull one of our levers:
- **Context optimization:** You need to optimize for context when 1) the model lacks contextual knowledge because it wasn't in its training set, 2) its knowledge is out of date, or 3) it requires knowledge of proprietary information. This axis maximizes **response accuracy**.
- **LLM optimization:** You need to optimize the LLM when 1) the model is producing inconsistent results with incorrect formatting, 2) the tone or style of speech is not correct, or 3) the reasoning is not being followed consistently. This axis maximizes **consistency of behavior**.
In reality this turns into a series of optimization steps, where we evaluate, make a hypothesis on how to optimize, apply it, evaluate, and re-assess for the next step. Here's an example of a fairly typical optimization flow:
![Accuracy mental model journey diagram](https://openaiassets.blob.core.windows.net/$web/API/docs/images/diagram-optimizing-accuracy-02.png)
In this example, we do the following:
- Begin with a prompt, then evaluate its performance
- Add static few-shot examples, which should improve consistency of results
- Add a retrieval step so the few-shot examples are brought in dynamically based on the question - this boosts performance by ensuring relevant context for each input
- Prepare a dataset of 50+ examples and fine-tune a model to increase consistency
- Tune the retrieval and add a fact-checking step to find hallucinations to achieve higher accuracy
- Re-train the fine-tuned model on the new training examples which include our enhanced RAG inputs
This is a fairly typical optimization pipeline for a tough business problem - it helps us decide whether we need more relevant context or if we need more consistent behavior from the model. Once we make that decision, we know which lever to pull as our first step toward optimization.
Now that we have a mental model, let's dive into the methods for taking action on all of these areas. We'll start in the bottom-left corner with Prompt Engineering.
### Prompt engineering
Prompt engineering is typically the best place to start. It is often the only method needed for use cases like summarization, translation, and code generation where a zero-shot approach can reach production levels of accuracy and consistency.
This is because it forces you to define what accuracy means for your use case - you start at the most basic level by providing an input, so you need to be able to judge whether or not the output matches your expectations. If it is not what you want, then the reasons **why** will show you what to use to drive further optimizations.
To achieve this, you should always start with a simple prompt and an expected output in mind, and then optimize the prompt by adding **context**, **instructions**, or **examples** until it gives you what you want.
#### Optimization
To optimize your prompts, I'll mostly lean on strategies from the [Prompt Engineering guide](https://platform.openai.com/docs/guides/prompt-engineering) in the OpenAI API documentation. Each strategy helps you tune Context, the LLM, or both:
| Strategy | Context optimization | LLM optimization |
| ----------------------------------------- | :------------------: | :--------------: |
| Write clear instructions | | X |
| Split complex tasks into simpler subtasks | X | X |
| Give GPTs time to "think" | | X |
| Test changes systematically | X | X |
| Provide reference text | X | |
| Use external tools | X | |
These can be a little difficult to visualize, so we'll test them out with a practical example. Let's use gpt-4-turbo to correct Icelandic sentences to see how this can work.
The [Icelandic Errors Corpus](https://repository.clarin.is/repository/xmlui/handle/20.500.12537/105) contains combinations of an Icelandic sentence with errors, and the corrected version of that sentence. We'll use the baseline GPT-4 model to try to solve this task, and then apply different optimization techniques to see how we can improve the model's performance.
Given an Icelandic sentence, we want the model to return a corrected version of the sentence. We'll use the BLEU score to measure the relative quality of the corrections.
| system | user | ground_truth | assistant | BLEU |
| ------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------- | ---------------------------------------------------------- | ---------------------------------------------------------- | ---- |
| The following sentences contain Icelandic sentences which may include errors. Please correct these errors using as few word changes as possible. | Sörvistölur eru nær hálsi og skartgripir kvenna á brjótsti. | Sörvistölur eru nær hálsi og skartgripir kvenna á brjósti. | Sörvistölur eru nær hálsi og skartgripir kvenna á brjósti. | 1.0 |
We perform a first attempt with GPT-4 with no examples, and it performs decently, getting a BLEU score of 62.
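As a sketch of how each attempt can be scored, assuming the third-party `sacrebleu` package and a small list of model outputs paired with corrected references (the data below is illustrative):
```python
import sacrebleu

# Illustrative evaluation data: model outputs and their corrected references.
hypotheses = ["Sörvistölur eru nær hálsi og skartgripir kvenna á brjósti."]
references = ["Sörvistölur eru nær hálsi og skartgripir kvenna á brjósti."]

# Corpus-level BLEU over the whole evaluation set, on a 0-100 scale.
score = sacrebleu.corpus_bleu(hypotheses, [references])
print(round(score.score, 1))
```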
We'll now add some few-shot examples and see whether we can teach the model the style we're looking for by showing rather than telling.
An example looks like this:
```example-chat
SYSTEM: The following sentences contain Icelandic sentences which may include errors. Please correct these errors using as few word changes as possible.
# Examples
USER: "Stofnendurnir séu margir og eru fulltrúar hennar frá Englandi, Grikklandi, Rússlandi, Svíþjóð og fleiri löndum Evrópu."
ASSISTANT: "Hann segir að stofnendur leynireglunnar séu margir og að fulltrúar hennar séu frá Englandi, Grikklandi, Rússlandi, Svíþjóð og fleiri löndum Evrópu."
USER: "Helsta fæða bjúgorma eru hægfara lífverur sem eru á sama búsvæði og bjúgormarnir, oft smærri ormar eins og burstormar (fræðiheiti: Polychatete)."
ASSISTANT: "Helsta fæða bjúgorma eru hægfara lífverur sem eru á sama búsvæði og bjúgormarnir, oft smærri ormar eins og burstaormar (fræðiheiti: Polychatete)."
USER: "Sörvistölur eru nær hálsi og skartgripir kvenna á brjótsti."
ASSISTANT: "Sörvistölur eru nær hálsi og skartgripir kvenna á brjósti."
USER: [input user query here]
```
The overall correction quality is better, showing an improvement to a BLEU score of **70 (+8 points)**. This is pretty good, and shows us that giving the model examples of the task is helping it to learn.
This tells us that it is the **behavior** of the model that we need to optimize - it already has the knowledge that it needs to solve the problem, so providing many more examples may be the optimization we need.
We'll revisit this later in the paper to test how our more advanced optimization methods play with this use case.
We've seen that prompt engineering is a great place to start, and that with the right tuning methods we can push the performance pretty far.
However, the biggest issue with prompt engineering is that it often doesn't scale - either we need dynamic context to be fed in so the model can deal with a wider range of problems than simple context stuffing allows, or we need more consistent behavior than we can achieve with few-shot examples.
Long-context models allow prompt engineering to scale further - however, beware that models can struggle to maintain attention across very large prompts with complex instructions, so you should always pair long-context models with evaluation at different context sizes to ensure you don't get "lost in the middle". "Lost in the middle" is a term for the fact that an LLM can't pay equal attention to all the tokens given to it at any one time, which can result in it missing information seemingly randomly. This doesn't mean you shouldn't use long context, but you need to pair it with thorough evaluation.
One open-source contributor, Greg Kamradt, made a useful evaluation called Needle In A Haystack (NIAH), which hid a piece of information at varying depths in long-context documents and evaluated the retrieval quality. This illustrates the trade-off with long context - it promises a much simpler retrieval process where you can dump everything in context, but at a cost in accuracy.
So how far can you really take prompt engineering? The answer is that it depends, and the way you make your decision is through evaluations.
### Evaluation
This is why **a good prompt with an evaluation set of questions and ground truth answers** is the best output from this stage. If we have a set of 20+ questions and answers, and we have looked into the details of the failures and have a hypothesis of why they're occurring, then we've got the right baseline to take on more advanced optimization methods.
Before you move on to more sophisticated optimization methods, it's also worth considering how to automate this evaluation to speed up your iterations. Some common practices we've seen be effective here are:
- Using approaches like [ROUGE](https://aclanthology.org/W04-1013/) or [BERTScore](https://arxiv.org/abs/1904.09675) to provide a finger-in-the-air judgment. This doesn't correlate that closely with human reviewers, but can give a quick and effective measure of how much an iteration changed your model outputs.
- Using [GPT-4](https://arxiv.org/pdf/2303.16634.pdf) as an evaluator as outlined in the G-Eval paper, where you provide the LLM a scorecard to assess the output as objectively as possible.
If you want to dive deeper on these, check out [this cookbook](https://cookbook.openai.com/examples/evaluation/how_to_eval_abstractive_summarization) which takes you through all of them in practice.
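As a minimal sketch of the model-as-grader approach, assuming a simple 1-5 scorecard (the rubric, model choice, and example data are placeholders you would adapt to your use case):
```python
from openai import OpenAI

client = OpenAI()

GRADER_PROMPT = (
    "You are grading a model answer against a ground-truth answer. "
    "Score it from 1 (completely wrong) to 5 (fully correct and faithful). "
    "Reply with the score only."
)


def grade(question: str, ground_truth: str, model_answer: str) -> int:
    # Ask a strong model to grade the answer against the placeholder rubric above.
    response = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[
            {"role": "system", "content": GRADER_PROMPT},
            {
                "role": "user",
                "content": (
                    f"Question: {question}\n"
                    f"Ground truth: {ground_truth}\n"
                    f"Model answer: {model_answer}"
                ),
            },
        ],
    )
    return int(response.choices[0].message.content.strip())


print(grade("What is 2 + 2?", "4", "The answer is 4."))
```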
## Understanding the tools
So you've done prompt engineering, you've got an eval set, and your model is still not doing what you need it to do. The most important next step is to diagnose where it is failing, and what tool works best to improve it.
Here is a basic framework for doing so:
![Classifying memory problem diagram](https://openaiassets.blob.core.windows.net/$web/API/docs/images/diagram-optimizing-accuracy-03.png)
You can think of framing each failed evaluation question as an **in-context** or **learned** memory problem. As an analogy, imagine writing an exam. There are two ways you can ensure you get the right answer:
- You attend class for the last 6 months, where you see many repeated examples of how a particular concept works. This is **learned** memory - you solve this with LLMs by showing examples of the prompt and the response you expect, so the model learns from those.
- You have the textbook with you, and can look up the right information to answer the question with. This is **in-context** memory - we solve this in LLMs by stuffing relevant information into the context window, either in a static way using prompt engineering, or in an industrial way using RAG.
These two optimization methods are **additive, not exclusive** - they stack, and some use cases will require you to use them together to achieve optimal performance.
Let's assume that we're facing an in-context memory problem - for this we'll use RAG to solve it.
### Retrieval-augmented generation (RAG)
RAG is the process of **R**etrieving content to **A**ugment your LLM's prompt before **G**enerating an answer. It is used to give the model **access to domain-specific context** to solve a task.
RAG is an incredibly valuable tool for increasing the accuracy and consistency of an LLM - many of our largest customer deployments at OpenAI were done using only prompt engineering and RAG.
![RAG diagram](https://openaiassets.blob.core.windows.net/$web/API/docs/images/diagram-optimizing-accuracy-04.png)
In this example we have embedded a knowledge base of statistics. When our user asks a question, we embed that question and retrieve the most relevant content from our knowledge base. This is presented to the model, which answers the question.
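A minimal sketch of this flow, assuming a tiny in-memory knowledge base (in a real deployment a vector database would replace the list and the similarity loop, and the statistics below are made up for illustration):
```python
import math

from openai import OpenAI

client = OpenAI()

# Tiny illustrative knowledge base; in practice this would live in a vector store.
documents = [
    "In 2023, average household electricity usage in the region fell by 4%.",
    "The 2023 census recorded 2.1 million households in the region.",
]


def embed(texts: list[str]) -> list[list[float]]:
    response = client.embeddings.create(model="text-embedding-3-small", input=texts)
    return [item.embedding for item in response.data]


def cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    return dot / (math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b)))


document_vectors = embed(documents)

question = "How did household electricity usage change in 2023?"
question_vector = embed([question])[0]

# Retrieve the most relevant document and present it to the model as context.
best_document = max(
    zip(documents, document_vectors),
    key=lambda pair: cosine(question_vector, pair[1]),
)[0]

answer = client.chat.completions.create(
    model="gpt-4-turbo",
    messages=[
        {"role": "system", "content": f"Answer using only this context:\n{best_document}"},
        {"role": "user", "content": question},
    ],
)
print(answer.choices[0].message.content)
```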
RAG applications introduce a new axis we need to optimize against, which is retrieval. For our RAG to work, we need to give the right context to the model, and then assess whether the model is answering correctly. I'll frame these in a grid here to show a simple way to think about evaluation with RAG:
![RAG evaluation diagram](https://openaiassets.blob.core.windows.net/$web/API/docs/images/diagram-optimizing-accuracy-05.png)
There are two areas where your RAG application can break down:
| Area | Problem | Resolution |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Retrieval | You can supply the wrong context, so the model can't possibly answer, or you can supply too much irrelevant context, which drowns out the real information and causes hallucinations. | Optimizing your retrieval, which can include: tuning the search to return the right results; tuning the search to include less noise; providing more information in each retrieved result. These are just examples, as tuning RAG performance is an industry unto itself, with libraries like LlamaIndex and LangChain giving many approaches to tuning here. |
| LLM | The model can also get the right context and do the wrong thing with it. | Prompt engineering by improving the instructions and method the model uses, and, if showing it examples increases accuracy, adding in fine-tuning |
The key thing to take away here is that the principle remains the same from our mental model at the beginning - you evaluate to find out what has gone wrong, and take an optimization step to fix it. The only difference with RAG is you now have the retrieval axis to consider.
While useful, RAG only solves our in-context learning issues - for many use cases, the issue will be ensuring the LLM can learn a task so it can perform it consistently and reliably. For this problem we turn to fine-tuning.
### Fine-tuning
To solve a learned memory problem, many developers will continue the training process of the LLM on a smaller, domain-specific dataset to optimize it for the specific task. This process is known as **fine-tuning**.
Fine-tuning is typically performed for one of two reasons:
- **To improve model accuracy on a specific task:** Training the model on task-specific data to solve a learned memory problem by showing it many examples of that task being performed correctly.
- **To improve model efficiency:** Achieve the same accuracy with fewer tokens or by using a smaller model.
The fine-tuning process begins by preparing a dataset of training examples - this is the most critical step, as your fine-tuning examples must exactly represent what the model will see in the real world.
Many customers use a process known as **prompt baking**, where you extensively log
your prompt inputs and outputs during a pilot. These logs can be pruned into an
effective training set with realistic examples.
![Fine-tuning process diagram](https://openaiassets.blob.core.windows.net/$web/API/docs/images/diagram-optimizing-accuracy-06.png)
Once you have this clean set, you can train a fine-tuned model by performing a **training** run - depending on the platform or framework you're using for training you may have hyperparameters you can tune here, similar to any other machine learning model. We always recommend maintaining a hold-out set to use for **evaluation** following training to detect overfitting. For tips on how to construct a good training set you can check out the [guidance](https://platform.openai.com/docs/guides/fine-tuning/analyzing-your-fine-tuned-model) in our Fine-tuning documentation, while for how to prep and tune the hold-out set there is more info [here](LINK_HERE). Once training is completed, the new, fine-tuned model is available for inference.
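As a minimal sketch, kicking off such a training run through the OpenAI API looks roughly like this, assuming the prepared examples are in a chat-format `train.jsonl` file (the filename is a placeholder):
```python
from openai import OpenAI

client = OpenAI()

# Upload the prepared training set (one chat-formatted example per line).
training_file = client.files.create(
    file=open("train.jsonl", "rb"),  # placeholder path to your prepared examples
    purpose="fine-tune",
)

# Start the training run; hyperparameters can be tuned like any other ML model.
job = client.fine_tuning.jobs.create(
    training_file=training_file.id,
    model="gpt-3.5-turbo",
)
print(job.id, job.status)

# Once the job succeeds, the fine-tuned model name appears on the job object
# and can be passed as `model` to the Chat Completions API for inference.
```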
For optimizing fine-tuning we'll focus on best practices we observe with OpenAI's model customization offerings, but these principles should hold true with other providers and OSS offerings. The key practices to observe here are:
- **Start with prompt engineering:** Have a solid evaluation set from prompt engineering which you can use as a baseline. This allows a low-investment approach until you're confident in your base prompt.
- **Start small, focus on quality:** Quality of training data is more important than quantity when fine-tuning on top of a foundation model. Start with 50+ examples, evaluate, and then dial your training set size up if you haven't yet hit your accuracy needs, and if the issues causing incorrect answers are due to consistency/behavior and not context.
- **Ensure your examples are representative:** One of the most common pitfalls we see is non-representative training data, where the examples used for fine-tuning differ subtly in formatting or form from what the LLM sees in production. For example, if you have a RAG application, fine-tune the model with RAG examples in it so it isn't learning how to use the context zero-shot.
### All of the above
These techniques stack on top of each other - if your early evals show issues with both context and behavior, then it's likely you may end up with fine-tuning + RAG in your production solution. This is ok - these stack to balance the weaknesses of both approaches. Some of the main benefits are:
- Using fine-tuning to **minimize the tokens** used for prompt engineering, as you replace instructions and few-shot examples with many training examples to ingrain consistent behavior in the model.
- **Teaching complex behavior** using extensive fine-tuning
- Using RAG to **inject context**, more recent content or any other specialized context required for your use cases
We'll continue building on the Icelandic correction example we used above. We'll test out the following approaches:
- Our original hypothesis was that this was a behavior optimization problem, so our first step will be to fine-tune a model. Well try both gpt-3.5-turbo and gpt-4 here.
- We'll also try RAG - in this instance our hypothesis is that relevant examples might give additional context which could help the model solve the problem, but this is a lower confidence optimization.
#### Fine-tuning
To fine-tune for our use case we'll use a dataset of 1000 examples similar to our few-shot examples above:
```example-chat
# One training example
SYSTEM: The following sentences contain Icelandic sentences which may include errors. Please correct these errors using as few word changes as possible.
USER: "Hið sameinaða fyrirtæki verður einn af stærstu bílaframleiðendum í heiminum."
ASSISTANT: "Hið sameinaða fyrirtæki verður einn af stærstu bílaframleiðendum heims."
```
We use these 1000 examples to train both gpt-3.5-turbo and gpt-4 fine-tuned models, and rerun our evaluation on our validation set. This confirmed our hypothesis - we got a meaningful bump in performance with both, with even the 3.5 model outperforming few-shot gpt-4 by 8 points:
| Run | Method                                      | BLEU score |
| --- | ------------------------------------------- | ---------- |
| 1 | gpt-4 with zero-shot | 62 |
| 2 | gpt-4 with 3 few-shot examples | 70 |
| 3 | gpt-3.5-turbo fine-tuned with 1000 examples | 78 |
| 4 | gpt-4 fine-tuned with 1000 examples | 87 |
Great, this is starting to look like production level accuracy for our use case. However, let's test whether we can squeeze a little more performance out of our pipeline by adding some relevant RAG examples to the prompt for in-context learning.
#### RAG + Fine-tuning
Our final optimization adds 1000 examples from outside of the training and validation sets which are embedded and placed in a vector database. We then run a further test with our gpt-4 fine-tuned model, with some perhaps surprising results:
![Icelandic case study diagram](https://openaiassets.blob.core.windows.net/$web/API/docs/images/diagram-optimizing-accuracy-07.png)
_BLEU score per tuning method (out of 100)_
RAG actually **decreased** accuracy, dropping four points from our GPT-4 fine-tuned model to 83.
This illustrates the point that you use the right optimization tool for the right job - each offers benefits and risks that we manage with evaluations and iterative changes. The behavior we witnessed in our evals and from what we know about this question told us that this is a behavior optimization problem where additional context will not necessarily help the model. This was borne out in practice - RAG actually confounded the model by giving it extra noise when it had already learned the task effectively through fine-tuning.
We now have a model that should be close to production-ready, and if we want to optimize further we can consider a wider diversity and quantity of training examples.
Now you should have an appreciation for RAG and fine-tuning, and when each is appropriate. The last thing to appreciate about these tools is that once you introduce them there is a trade-off in your speed to iterate:
- For RAG you need to tune the retrieval as well as LLM behavior
- With fine-tuning you need to rerun the fine-tuning process and manage your training and validation sets when you do additional tuning.
Both of these can be time-consuming and complex processes, which can introduce regression issues as your LLM application becomes more complex. If you take away one thing from this paper, let it be to squeeze as much accuracy out of basic methods as you can before reaching for more complex RAG or fine-tuning - let your accuracy target be the objective, not jumping for RAG + FT because they are perceived as the most sophisticated.
## How much accuracy is “good enough” for production
Tuning for accuracy can be a never-ending battle with LLMs - they are unlikely to get to 99.999% accuracy using off-the-shelf methods. This section is all about deciding when is enough for accuracy - how do you get comfortable putting an LLM in production, and how do you manage the risk of the solution you put out there.
I find it helpful to think of this in both a **business** and **technical** context. I'm going to describe the high level approaches to managing both, and use a customer service help-desk use case to illustrate how we manage our risk in both cases.
### Business
For the business it can be hard to trust LLMs after the comparative certainties of rules-based or traditional machine learning systems, or indeed humans! A system where failures are open-ended and unpredictable is a difficult circle to square.
An approach I've seen be successful here was for a customer service use case - for this, we did the following:
First, we identified the primary success and failure cases and assigned an estimated cost to them. This gives us a clear articulation of what the solution is likely to save or cost based on pilot performance (a worked check of the numbers follows the table below).
- For example, a case getting solved by an AI where it was previously solved by a human may save $20.
- Someone getting escalated to a human when they shouldn't might cost **$40**.
- In the worst case scenario, a customer gets so frustrated with the AI they churn, costing us **$1000**. We assume this happens in 5% of cases.
| Event | Value | Number of cases | Total value |
| ----------------------- | ----- | --------------- | ----------- |
| AI success | +20 | 815 | $16,300 |
| AI failure (escalation) | -40 | 175.75 | $7,030 |
| AI failure (churn) | -1000 | 9.25 | $9,250 |
| **Result** | | | **+20** |
| **Break-even accuracy** | | | **81.5%** |
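As a small worked check of the table's arithmetic, assuming 1,000 pilot cases and the per-event values above:
```python
# Per-event values from the table above.
cases = 1000
success_value = 20
escalation_cost = 40
churn_cost = 1000
churn_share_of_failures = 0.05  # 5% of failures churn, the rest escalate

accuracy = 0.815  # the break-even accuracy from the table
successes = cases * accuracy
failures = cases - successes

total_value = (
    successes * success_value
    - failures * (1 - churn_share_of_failures) * escalation_cost
    - failures * churn_share_of_failures * churn_cost
)
print(total_value)  # ~+$20, i.e. roughly break-even at 81.5% accuracy
```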
The other thing we did was measure the empirical stats around the process, which help us measure the macro impact of the solution. Again using customer service, these could be:
- The CSAT score for purely human interactions vs. AI ones
- The decision accuracy for retrospectively reviewed cases for human vs. AI
- The time to resolution for human vs. AI
In the customer service example, this helped us make two key decisions following a few pilots to get clear data:
1. Even if our LLM solution escalated to humans more than we wanted, it still made an enormous operational cost saving over the existing solution. This meant that an accuracy of even 85% could be ok, if those 15% were primarily early escalations.
2. Where the cost of failure was very high, such as a fraud case being incorrectly resolved, we decided the human would drive and the AI would function as an assistant. In this case, the decision accuracy stat helped us make the call that we werent comfortable with full autonomy.
### Technical
On the technical side things are more straightforward - now that the business is clear on the value it expects and the cost of what can go wrong, your role is to build a solution that handles failures gracefully in a way that doesnt disrupt the user experience.
Lets use the customer service example one more time to illustrate this, and well assume weve got a model that is 85% accurate in determining intent. As a technical team, here are a few ways we can minimize the impact of the incorrect 15%:
- We can prompt engineer the model to ask the customer for more information if it isnt confident, so our first-time accuracy may drop but overall accuracy improves given two attempts to determine intent.
- We can give the second-line assistant the option to pass back to the intent determination stage, again giving the UX a way of self-healing at the cost of some additional user latency.
- We can prompt engineer the model to hand off to a human if the intent is unclear, which costs us some operational savings in the short-term but may offset customer churn risk in the long term.
Those decisions then feed into our UX, which trades some speed for higher accuracy or adds more human interventions, and those in turn feed into the cost model covered in the business section above.
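To make this concrete, here is a minimal routing sketch - the function, thresholds, and labels are hypothetical rather than part of any SDK, and a real system would tune them against evals:

```python
# Hypothetical confidence-based routing for the help-desk example.
# Thresholds and attempt limits are illustrative assumptions.
def route(intent: str, confidence: float, attempts: int) -> str:
    RESOLVE_THRESHOLD = 0.8   # confident enough to let the AI proceed
    CLARIFY_THRESHOLD = 0.5   # ask the customer a clarifying question first
    MAX_ATTEMPTS = 2          # after this, hand off to a human

    if confidence >= RESOLVE_THRESHOLD:
        return "resolve_with_ai"
    if confidence >= CLARIFY_THRESHOLD and attempts < MAX_ATTEMPTS:
        return "ask_clarifying_question"
    return "escalate_to_human"
```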
You now have an approach to breaking down the business and technical decisions involved in setting an accuracy target that is grounded in business reality.
## Taking this forward
This is a high-level mental model for thinking about maximizing accuracy for LLMs, the tools you can use to achieve it, and the approach for deciding when your accuracy is good enough for production. You have the framework and tools you need to get to production consistently, and if you want to be inspired by what others have achieved with these methods then look no further than our customer stories, where use cases like [Morgan Stanley](https://openai.com/customer-stories/morgan-stanley) and [Klarna](https://openai.com/customer-stories/klarna) show what you can achieve by leveraging these techniques.
Best of luck, and were excited to see what you build with this!

@ -0,0 +1,85 @@
You can use the 'create and stream' helpers in the Python and Node SDKs to create a run and stream the response.
<CodeSample
title="Create and Stream a Run"
defaultLanguage="python"
code={{
python: `
from typing_extensions import override
from openai import AssistantEventHandler
# First, we create a EventHandler class to define
# how we want to handle the events in the response stream.
class EventHandler(AssistantEventHandler):
@override
def on_text_created(self, text) -> None:
print(f"\\nassistant > ", end="", flush=True)
@override
def on_text_delta(self, delta, snapshot):
print(delta.value, end="", flush=True)
def on_tool_call_created(self, tool_call):
print(f"\\nassistant > {tool_call.type}\\n", flush=True)
def on_tool_call_delta(self, delta, snapshot):
if delta.type == 'code_interpreter':
if delta.code_interpreter.input:
print(delta.code_interpreter.input, end="", flush=True)
if delta.code_interpreter.outputs:
print(f"\\n\\noutput >", flush=True)
for output in delta.code_interpreter.outputs:
if output.type == "logs":
print(f"\\n{output.logs}", flush=True)
# Then, we use the \`stream\` SDK helper
# with the \`EventHandler\` class to create the Run
# and stream the response.
with client.beta.threads.runs.stream(
thread_id=thread.id,
assistant_id=assistant.id,
instructions="Please address the user as Jane Doe. The user has a premium account.",
event_handler=EventHandler(),
) as stream:
stream.until_done()
`.trim(),
"node.js": `
// We use the stream SDK helper to create a run with
// streaming. The SDK provides helpful event listeners to handle
// the streamed response.
const run = openai.beta.threads.runs.stream(thread.id, {
assistant_id: assistant.id
})
.on('textCreated', (text) => process.stdout.write('\\nassistant > '))
.on('textDelta', (textDelta, snapshot) => process.stdout.write(textDelta.value))
.on('toolCallCreated', (toolCall) => process.stdout.write(\`\\nassistant > $\{toolCall.type\}\\n\\n\`))
.on('toolCallDelta', (toolCallDelta, snapshot) => {
if (toolCallDelta.type === 'code_interpreter') {
if (toolCallDelta.code_interpreter.input) {
process.stdout.write(toolCallDelta.code_interpreter.input);
}
if (toolCallDelta.code_interpreter.outputs) {
process.stdout.write("\\noutput >\\n");
toolCallDelta.code_interpreter.outputs.forEach(output => {
if (output.type === "logs") {
process.stdout.write(\`\\n$\{output.logs\}\\n\`);
}
});
}
}
});
`.trim(),
}}
/>
See the full list of Assistants streaming events in our API reference [here](/docs/api-reference/assistants-streaming/events). You can also see a list of SDK event listeners for these events in the [Python](https://github.com/openai/openai-python/blob/main/helpers.md#assistant-events) & [Node](https://github.com/openai/openai-node/blob/master/helpers.md#assistant-events) repository documentation.
### Next
1. Dive deeper into [How Assistants work](/docs/assistants/how-it-works)
2. Learn more about [Tools](/docs/assistants/tools)
3. Explore the [Assistants playground](/playground?mode=assistant)

@ -0,0 +1,78 @@
Runs are asynchronous, which means you'll want to monitor their `status` by polling the Run object until a [terminal status](/docs/assistants/how-it-works/runs-and-run-steps) is reached. For convenience, the 'create and poll' SDK helpers assist both in creating the run and then polling for its completion.
<CodeSample
title="Create a Run"
defaultLanguage="python"
code={{
python: `
run = client.beta.threads.runs.create_and_poll(
thread_id=thread.id,
assistant_id=assistant.id,
instructions="Please address the user as Jane Doe. The user has a premium account."
)
`.trim(),
"node.js": `
let run = await openai.beta.threads.runs.createAndPoll(
thread.id,
{
assistant_id: assistant.id,
instructions: "Please address the user as Jane Doe. The user has a premium account."
}
);
`.trim(),
curl: `
curl https://api.openai.com/v1/threads/thread_abc123/runs \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-H "OpenAI-Beta: assistants=v2" \\
-d '{
"assistant_id": "asst_abc123",
"instructions": "Please address the user as Jane Doe. The user has a premium account."
}'
`.trim(),
}}
/>
Once the Run completes, you can [list the Messages](/docs/api-reference/messages/listMessages) added to the Thread by the Assistant.
<CodeSample
defaultLanguage="python"
code={{
python: `
if run.status == 'completed':
messages = client.beta.threads.messages.list(
thread_id=thread.id
)
print(messages)
else:
print(run.status)
`.trim(),
"node.js": `
if (run.status === 'completed') {
const messages = await openai.beta.threads.messages.list(
run.thread_id
);
for (const message of messages.data.reverse()) {
console.log(\`$\{message.role\} > $\{message.content[0].text.value\}\`);
}
} else {
console.log(run.status);
}
`.trim(),
curl: `
curl https://api.openai.com/v1/threads/thread_abc123/messages \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "OpenAI-Beta: assistants=v2"
`.trim(),
}}
/>
You may also want to list the [Run Steps](/docs/api-reference/runs/listRunSteps) of this Run if you'd like to look at any tool calls made during this Run.
### Next
1. Dive deeper into [How Assistants work](/docs/assistants/how-it-works)
2. Learn more about [Tools](/docs/assistants/tools)
3. Explore the [Assistants playground](/playground?mode=assistant)

@ -0,0 +1,155 @@
# Production best practices
This guide provides a comprehensive set of best practices to help you transition from prototype to production. Whether you are a seasoned machine learning engineer or a recent enthusiast, this guide should provide you with the tools you need to successfully put the platform to work in a production setting: from securing access to our API to designing a robust architecture that can handle high traffic volumes. Use this guide to help develop a plan for deploying your application as smoothly and effectively as possible.
If you want to explore best practices for going into production further, please check out our Developer Day talk:
<iframe
width="100%"
height="315"
src="https://www.youtube-nocookie.com/embed/XGJNo8TpuVA?si=mvYm3Un23iHnlXcg"
title="YouTube video player"
frameBorder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
allowFullScreen
></iframe>
## Setting up your organization
Once you [log in](/login) to your OpenAI account, you can find your organization name and ID in your [organization settings](/account/organization). The organization name is the label for your organization, shown in user interfaces. The organization ID is the unique identifier for your organization which can be used in API requests.
Users who belong to multiple organizations can [pass a header](/docs/api-reference/requesting-organization) to specify which organization is used for an API request. Usage from these API requests will count against the specified organization's quota. If no header is provided, the [default organization](/account/api-keys) will be billed. You can change your default organization in your [user settings](/account/api-keys).
You can invite new members to your organization from the [Team page](/account/team). Members can be **readers** or **owners**. Readers can make API requests and view basic organization information, while owners can modify billing information and manage members within an organization.
### Managing billing limits
To begin using the OpenAI API, enter your [billing information](/account/billing/overview). If no billing information is entered, you will still have login access but will be unable to make API requests.
Once youve entered your billing information, you will have an approved usage limit of $100 per month, which is set by OpenAI. Your quota limit will automatically increase as your usage of the platform increases and you move from one [usage tier](/docs/guides/rate-limits/usage-tiers) to another. You can review your current usage limit on the [limits](/account/rate-limits) page in your account settings.
If youd like to be notified when your usage exceeds a certain dollar amount, you can set a notification threshold through the [usage limits](/account/limits) page. When the notification threshold is reached, the owners of the organization will receive an email notification. You can also set a monthly budget so that, once the monthly budget is reached, any subsequent API requests will be rejected. Note that these limits are best effort, and there may be 5 to 10 minutes of delay between the usage and the limits being enforced.
### API keys
The OpenAI API uses API keys for authentication. Visit your [API keys](/account/api-keys) page to retrieve the API key you'll use in your requests.
This is a relatively straightforward way to control access, but you must be vigilant about securing these keys. Avoid exposing the API keys in your code or in public repositories; instead, store them in a secure location. You should expose your keys to your application using environment variables or a secret management service, so that you don't need to hard-code them in your codebase. Read more in our [Best practices for API key safety](https://help.openai.com/en/articles/5112595-best-practices-for-api-key-safety).
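For example, a minimal sketch of reading the key from an environment variable (the Python SDK also picks up `OPENAI_API_KEY` automatically if you omit the argument):

```python
import os
from openai import OpenAI

# Read the key from the environment instead of hard-coding it in source control.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
```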
API key usage can be monitored on the [Usage page](/usage) once tracking is enabled. If you are using an API key generated prior to Dec 20, 2023, tracking will not be enabled by default. You can enable tracking going forward on the [API key management dashboard](/api-keys). All API keys generated after Dec 20, 2023 have tracking enabled. Any previous untracked usage will be displayed as `Untracked` in the dashboard.
### Staging accounts
As you scale, you may want to create separate organizations for your staging and production environments. Please note that you can sign up using two separate email addresses like bob+prod@widgetcorp.com and bob+dev@widgetcorp.com to create two organizations. This will allow you to isolate your development and testing work so you don't accidentally disrupt your live application. You can also limit access to your production organization this way.
## Scaling your solution architecture
When designing your application or service for production that uses our API, it's important to consider how you will scale to meet traffic demands. There are a few key areas you will need to consider regardless of the cloud service provider of your choice:
- **Horizontal scaling**: You may want to scale your application out horizontally to accommodate requests to your application that come from multiple sources. This could involve deploying additional servers or containers to distribute the load. If you opt for this type of scaling, make sure that your architecture is designed to handle multiple nodes and that you have mechanisms in place to balance the load between them.
- **Vertical scaling**: Another option is to scale your application up vertically, meaning you can beef up the resources available to a single node. This would involve upgrading your server's capabilities to handle the additional load. If you opt for this type of scaling, make sure your application is designed to take advantage of these additional resources.
- **Caching**: By storing frequently accessed data, you can improve response times without needing to make repeated calls to our API. Your application will need to be designed to use cached data whenever possible and invalidate the cache when new information is added. There are a few different ways you could do this. For example, you could store data in a database, filesystem, or in-memory cache, depending on what makes the most sense for your application. A minimal caching sketch follows this list.
- **Load balancing**: Finally, consider load-balancing techniques to ensure requests are distributed evenly across your available servers. This could involve using a load balancer in front of your servers or using DNS round-robin. Balancing the load will help improve performance and reduce bottlenecks.
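To make the caching point above concrete, here is a minimal in-memory sketch - the helper name and cache choice are illustrative; a production system might use Redis or a database plus a TTL policy instead:

```python
import hashlib
import json

from openai import OpenAI

client = OpenAI()
_cache: dict = {}  # illustrative in-memory cache

def cached_answer(model: str, messages: list) -> str:
    # Key the cache on the full request payload so identical requests reuse results.
    key = hashlib.sha256(
        json.dumps({"model": model, "messages": messages}, sort_keys=True).encode()
    ).hexdigest()
    if key not in _cache:
        response = client.chat.completions.create(model=model, messages=messages)
        _cache[key] = response.choices[0].message.content
    return _cache[key]
```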
## Managing rate limits
When using our API, it's important to understand and plan for [rate limits](/docs/guides/rate-limits).
## Improving latencies
Check out our most up-to-date guide on [latency optimization](/docs/guides/latency-optimization).
Latency is the time it takes for a request to be processed and a response to be returned. In this section, we will discuss some factors that influence the latency of our text generation models and provide suggestions on how to reduce it.
The latency of a completion request is mostly influenced by two factors: the model and the number of tokens generated. Across the life cycle of a completion request, the bulk of the latency typically arises from the token generation step.
> **Intuition**: Prompt tokens add very little latency to completion calls. Time to generate completion tokens is much longer, as tokens are generated one at a time. Longer generations accumulate latency because each additional token must be generated in sequence.
### Common factors affecting latency and possible mitigation techniques
Now that we have looked at the basics of latency, lets take a look at the various factors that can affect it, broadly ordered from most impactful to least impactful.
#### Model
Our API offers different models with varying levels of complexity and generality. The most capable models, such as `gpt-4`, can generate more complex and diverse completions, but they also take longer to process your query.
Models such as `gpt-3.5-turbo` can generate faster and cheaper chat completions, but they may produce results that are less accurate or relevant for your query. You can choose the model that best suits your use case and the trade-off between speed and quality.
#### Number of completion tokens
Requesting a large number of completion tokens can lead to increased latencies:
- **Lower max tokens**: for requests with a similar token generation count, those that have a lower `max_tokens` parameter incur less latency.
- **Include stop sequences**: to prevent generating unneeded tokens, add a stop sequence. For example, you can use stop sequences to generate a list with a specific number of items. In this case, by using `11.` as a stop sequence, you can generate a list with only 10 items, since the completion will stop when `11.` is reached. [Read our help article on stop sequences](https://help.openai.com/en/articles/5072263-how-do-i-use-stop-sequences) for more context on how you can do this.
- **Generate fewer completions**: lower the values of `n` and `best_of` when possible, where `n` is how many completions to generate for each prompt and `best_of` is how many completions are generated server-side before returning the one with the highest log probability per token.
If `n` and `best_of` both equal 1 (which is the default), the number of generated tokens will be at most `max_tokens`.
If `n` (the number of completions returned) or `best_of` (the number of completions generated for consideration) are set to greater than 1, each request will create multiple outputs. Here, you can consider the number of generated tokens to be at most `max_tokens * max(n, best_of)`.
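As a minimal sketch of these parameters on the chat completions endpoint (the model choice, prompt, and limits are illustrative):

```python
from openai import OpenAI

client = OpenAI()

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "List 10 popular programming languages."}],
    max_tokens=100,   # cap the number of generated tokens
    stop=["11."],     # stop before the list runs past 10 items
    n=1,              # a single completion keeps generated tokens to a minimum
)
print(response.choices[0].message.content)
```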
#### Streaming
Setting `stream: true` in a request makes the model start returning tokens as soon as they are available, instead of waiting for the full sequence of tokens to be generated. It does not change the time to get all the tokens, but it reduces the time to first token for applications where you want to show partial progress or plan to stop generations early. This can be a better user experience, so its worth experimenting with streaming.
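A minimal streaming sketch with the chat completions endpoint (the model and prompt are placeholders):

```python
from openai import OpenAI

client = OpenAI()

stream = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Write a haiku about latency."}],
    stream=True,
)
for chunk in stream:
    # Print tokens as they arrive so the user sees partial progress immediately.
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
```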
#### Infrastructure
Our servers are currently located in the US. While we hope to have global redundancy in the future, in the meantime you could consider locating the relevant parts of your infrastructure in the US to minimize the roundtrip time between your servers and the OpenAI servers.
#### Batching
Depending on your use case, batching may help. If you are sending multiple requests to the same endpoint, you can [batch the prompts](/docs/guides/rate-limits/batching-requests) to be sent in the same request. This will reduce the number of requests you need to make. The prompt parameter can hold up to 20 unique prompts. We advise you to test out this method and see if it helps. In some cases, you may end up increasing the number of generated tokens which will slow the response time.
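For example, here is a minimal batching sketch on the legacy completions endpoint, which accepts a list for the `prompt` parameter (the model and prompts are placeholders):

```python
from openai import OpenAI

client = OpenAI()

prompts = [
    "Summarize: <text one>",
    "Summarize: <text two>",
    "Summarize: <text three>",
]
response = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt=prompts,       # up to 20 prompts in a single request
    max_tokens=64,
)
# Each returned choice carries an index mapping it back to its prompt.
for choice in sorted(response.choices, key=lambda c: c.index):
    print(prompts[choice.index], "->", choice.text.strip())
```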
## Managing costs
To monitor your costs, you can set a [notification threshold](/account/limits) in your account to receive an email alert once you pass a certain usage threshold. You can also set a [monthly budget](/account/limits). Please be mindful of the potential for a monthly budget to cause disruptions to your application/users. Use the [usage tracking dashboard](/account/usage) to monitor your token usage during the current and past billing cycles.
### Text generation
One of the challenges of moving your prototype into production is budgeting for the costs associated with running your application. OpenAI offers a [pay-as-you-go pricing model](https://openai.com/api/pricing/), with prices per 1,000 tokens (roughly equal to 750 words). To estimate your costs, you will need to project the token utilization. Consider factors such as traffic levels, the frequency with which users will interact with your application, and the amount of data you will be processing.
**One useful framework for thinking about reducing costs is to consider costs as a function of the number of tokens and the cost per token.** There are two potential avenues for reducing costs using this framework. First, you could work to reduce the cost per token by switching to smaller models for some tasks in order to reduce costs. Alternatively, you could try to reduce the number of tokens required. There are a few ways you could do this, such as by using shorter prompts, [fine-tuning](/docs/guides/fine-tuning) models, or caching common user queries so that they don't need to be processed repeatedly.
You can experiment with our interactive [tokenizer tool](/tokenizer) to help you estimate costs. The API and playground also return token counts as part of the response. Once youve got things working with our most capable model, you can see if the other models can produce the same results with lower latency and cost. Learn more in our [token usage help article](https://help.openai.com/en/articles/6614209-how-do-i-check-my-token-usage).
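As a rough, illustrative projection of monthly cost (the price, traffic, and response-length figures below are placeholder assumptions - always check the pricing page for the model you actually use):

```python
import tiktoken

PRICE_PER_1K_TOKENS = 0.0015   # placeholder, not a quoted price
REQUESTS_PER_MONTH = 100_000   # assumed traffic
EXPECTED_OUTPUT_TOKENS = 150   # assumed typical response length

enc = tiktoken.get_encoding("cl100k_base")
prompt = "Summarize the following support ticket: ..."
prompt_tokens = len(enc.encode(prompt))

monthly_tokens = (prompt_tokens + EXPECTED_OUTPUT_TOKENS) * REQUESTS_PER_MONTH
print(f"~${monthly_tokens / 1000 * PRICE_PER_1K_TOKENS:,.2f} per month")
```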
## MLOps strategy
As you move your prototype into production, you may want to consider developing an MLOps strategy. MLOps (machine learning operations) refers to the process of managing the end-to-end life cycle of your machine learning models, including any models you may be fine-tuning using our API. There are a number of areas to consider when designing your MLOps strategy. These include:
- Data and model management: managing the data used to train or fine-tune your model and tracking versions and changes.
- Model monitoring: tracking your model's performance over time and detecting any potential issues or degradation.
- Model retraining: ensuring your model stays up to date with changes in data or evolving requirements and retraining or fine-tuning it as needed.
- Model deployment: automating the process of deploying your model and related artifacts into production.
Thinking through these aspects of your application will help ensure your model stays relevant and performs well over time.
## Security and compliance
As you move your prototype into production, you will need to assess and address any security and compliance requirements that may apply to your application. This will involve examining the data you are handling, understanding how our API processes data, and determining what regulations you must adhere to. Our [security practices](https://www.openai.com/security) and [trust and compliance portal](https://trust.openai.com/) provide our most comprehensive and up-to-date documentation. For reference, here is our [Privacy Policy](https://openai.com/privacy/) and [Terms of Use](https://openai.com/api/policies/terms/).
Some common areas you'll need to consider include data storage, data transmission, and data retention. You might also need to implement data privacy protections, such as encryption or anonymization where possible. In addition, you should follow best practices for secure coding, such as input sanitization and proper error handling.
### Safety best practices
When creating your application with our API, consider our [safety best practices](/docs/guides/safety-best-practices) to ensure your application is safe and successful. These recommendations highlight the importance of testing the product extensively, being proactive about addressing potential issues, and limiting opportunities for misuse.
## Business considerations
As projects using AI move from prototype to production, it is important to consider how to build a great product with AI and how that ties back to your core business. We certainly don't have all the answers but a great starting place is a talk from our Developer Day where we dive into this with some of our customers:
<iframe
width="100%"
height="315"
src="https://www.youtube-nocookie.com/embed/knHW-p31R0c?si=g0ddoMoUykjclH4k"
title="YouTube video player"
frameBorder="0"
allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share"
allowFullScreen
></iframe>

@ -0,0 +1,34 @@
# Actions in production
## Rate limits
Consider implementing rate limiting on the API endpoints you expose. ChatGPT will respect 429 response codes and dynamically back off from sending requests to your action after receiving a certain number of 429's or 500's in a short period of time.
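A minimal sketch of returning 429s from an exposed action endpoint (FastAPI is just one choice here; the route name, window, and request budget are illustrative):

```python
import time

from fastapi import FastAPI
from fastapi.responses import JSONResponse

app = FastAPI()
WINDOW_SECONDS = 60
MAX_REQUESTS = 30       # illustrative budget per window
_hits: list = []

@app.post("/my-action")
async def my_action():
    now = time.time()
    # Keep only hits inside the current window, then check the budget.
    _hits[:] = [t for t in _hits if now - t < WINDOW_SECONDS]
    if len(_hits) >= MAX_REQUESTS:
        # ChatGPT respects 429s and will back off before retrying.
        return JSONResponse(status_code=429, content={"error": "rate_limited"})
    _hits.append(now)
    return {"result": "ok"}
```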
### Timeouts
When making API calls during the actions experience, timeouts take place if the following thresholds are exceeded:
- 45 seconds round trip for API calls
### Use TLS and HTTPS
All traffic to your action must use TLS 1.2 or later on port 443 with a valid public certificate.
### IP egress ranges
ChatGPT will call your action from an IP address from one of these [CIDR blocks](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing):
- `23.102.140.112/28`
- `13.66.11.96/28`
- `104.210.133.240/28`
- `20.97.188.144/28`
- `20.161.76.48/28`
- `52.234.32.208/28`
- `52.156.132.32/28`
- `40.84.180.128/28`
You may wish to explicitly allowlist these IP addresses.
### How is Action data used?
Actions connect ChatGPT to external apps. If a user interacts with a GPTs custom action, ChatGPT may send parts of their conversation to the actions endpoint.

@ -0,0 +1,578 @@
# Prompt engineering
This guide shares strategies and tactics for getting better results from large language models (sometimes referred to as GPT models) like GPT-4o. The methods described here can sometimes be deployed in combination for greater effect. We encourage experimentation to find the methods that work best for you.
You can also explore example prompts which showcase what our models are capable of:
<IconItem
color="green"
title="Prompt examples"
className="mt-6"
>
Explore prompt examples to learn what GPT models can do
</IconItem>
## Six strategies for getting better results
### Write clear instructions
These models cant read your mind. If outputs are too long, ask for brief replies. If outputs are too simple, ask for expert-level writing. If you dislike the format, demonstrate the format youd like to see. The less the model has to guess at what you want, the more likely youll get it.
Tactics:
- [Include details in your query to get more relevant answers](/docs/guides/prompt-engineering/tactic-include-details-in-your-query-to-get-more-relevant-answers)
- [Ask the model to adopt a persona](/docs/guides/prompt-engineering/tactic-ask-the-model-to-adopt-a-persona)
- [Use delimiters to clearly indicate distinct parts of the input](/docs/guides/prompt-engineering/tactic-use-delimiters-to-clearly-indicate-distinct-parts-of-the-input)
- [Specify the steps required to complete a task](/docs/guides/prompt-engineering/tactic-specify-the-steps-required-to-complete-a-task)
- [Provide examples](/docs/guides/prompt-engineering/tactic-provide-examples)
- [Specify the desired length of the output](/docs/guides/prompt-engineering/tactic-specify-the-desired-length-of-the-output)
### Provide reference text
Language models can confidently invent fake answers, especially when asked about esoteric topics or for citations and URLs. In the same way that a sheet of notes can help a student do better on a test, providing reference text to these models can help in answering with fewer fabrications.
Tactics:
- [Instruct the model to answer using a reference text](/docs/guides/prompt-engineering/tactic-instruct-the-model-to-answer-using-a-reference-text)
- [Instruct the model to answer with citations from a reference text](/docs/guides/prompt-engineering/tactic-instruct-the-model-to-answer-with-citations-from-a-reference-text)
### Split complex tasks into simpler subtasks
Just as it is good practice in software engineering to decompose a complex system into a set of modular components, the same is true of tasks submitted to a language model. Complex tasks tend to have higher error rates than simpler tasks. Furthermore, complex tasks can often be re-defined as a workflow of simpler tasks in which the outputs of earlier tasks are used to construct the inputs to later tasks.
Tactics:
- [Use intent classification to identify the most relevant instructions for a user query](/docs/guides/prompt-engineering/tactic-use-intent-classification-to-identify-the-most-relevant-instructions-for-a-user-query)
- [For dialogue applications that require very long conversations, summarize or filter previous dialogue](/docs/guides/prompt-engineering/tactic-for-dialogue-applications-that-require-very-long-conversations-summarize-or-filter-previous-dialogue)
- [Summarize long documents piecewise and construct a full summary recursively](/docs/guides/prompt-engineering/tactic-summarize-long-documents-piecewise-and-construct-a-full-summary-recursively)
### Give the model time to "think"
If asked to multiply 17 by 28, you might not know it instantly, but can still work it out with time. Similarly, models make more reasoning errors when trying to answer right away, rather than taking time to work out an answer. Asking for a "chain of thought" before an answer can help the model reason its way toward correct answers more reliably.
Tactics:
- [Instruct the model to work out its own solution before rushing to a conclusion](/docs/guides/prompt-engineering/tactic-instruct-the-model-to-work-out-its-own-solution-before-rushing-to-a-conclusion)
- [Use inner monologue or a sequence of queries to hide the model's reasoning process](/docs/guides/prompt-engineering/tactic-use-inner-monologue-or-a-sequence-of-queries-to-hide-the-model-s-reasoning-process)
- [Ask the model if it missed anything on previous passes](/docs/guides/prompt-engineering/tactic-ask-the-model-if-it-missed-anything-on-previous-passes)
### Use external tools
Compensate for the weaknesses of the model by feeding it the outputs of other tools. For example, a text retrieval system (sometimes called RAG or retrieval augmented generation) can tell the model about relevant documents. A code execution engine like OpenAI's Code Interpreter can help the model do math and run code. If a task can be done more reliably or efficiently by a tool rather than by a language model, offload it to get the best of both.
Tactics:
- [Use embeddings-based search to implement efficient knowledge retrieval](/docs/guides/prompt-engineering/tactic-use-embeddings-based-search-to-implement-efficient-knowledge-retrieval)
- [Use code execution to perform more accurate calculations or call external APIs](/docs/guides/prompt-engineering/tactic-use-code-execution-to-perform-more-accurate-calculations-or-call-external-apis)
- [Give the model access to specific functions](/docs/guides/prompt-engineering/tactic-give-the-model-access-to-specific-functions)
### Test changes systematically
Improving performance is easier if you can measure it. In some cases a modification to a prompt will achieve better performance on a few isolated examples but lead to worse overall performance on a more representative set of examples. Therefore, to be sure that a change is net positive to performance it may be necessary to define a comprehensive test suite (also known as an "eval").
Tactic:
- [Evaluate model outputs with reference to gold-standard answers](/docs/guides/prompt-engineering/tactic-evaluate-model-outputs-with-reference-to-gold-standard-answers)
## Tactics
Each of the strategies listed above can be instantiated with specific tactics. These tactics are meant to provide ideas for things to try. They are by no means fully comprehensive, and you should feel free to try creative ideas not represented here.
### Strategy: Write clear instructions
#### Tactic: Include details in your query to get more relevant answers
In order to get a highly relevant response, make sure that requests provide any important details or context. Otherwise you are leaving it up to the model to guess what you mean.
| **Worse** | **Better** |
| ----------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| How do I add numbers in Excel? | How do I add up a row of dollar amounts in Excel? I want to do this automatically for a whole sheet of rows with all the totals ending up on the right in a column called "Total". |
| Whos president? | Who was the president of Mexico in 2021, and how frequently are elections held? |
| Write code to calculate the Fibonacci sequence. | Write a TypeScript function to efficiently calculate the Fibonacci sequence. Comment the code liberally to explain what each piece does and why it's written that way. |
| Summarize the meeting notes. | Summarize the meeting notes in a single paragraph. Then write a markdown list of the speakers and each of their key points. Finally, list the next steps or action items suggested by the speakers, if any. |
#### Tactic: Ask the model to adopt a persona
The system message can be used to specify the persona used by the model in its replies.
```example-chat link=/playground/p/default-playful-thank-you-note
SYSTEM: When I ask for help to write something, you will reply with a document that contains at least one joke or playful comment in every paragraph.
USER: Write a thank you note to my steel bolt vendor for getting the delivery in on time and in short notice. This made it possible for us to deliver an important order.
```
#### Tactic: Use delimiters to clearly indicate distinct parts of the input
Delimiters like triple quotation marks, XML tags, section titles, etc. can help demarcate sections of text to be treated differently.
```example-chat link=/playground/p/default-delimiters-1
USER: Summarize the text delimited by triple quotes with a haiku.
"""insert text here"""
```
```example-chat link=/playground/p/default-delimiters-2
SYSTEM: You will be provided with a pair of articles (delimited with XML tags) about the same topic. First summarize the arguments of each article. Then indicate which of them makes a better argument and explain why.
USER: insert first article here
insert second article here
```
```example-chat link=/playground/p/default-delimiters-3
SYSTEM: You will be provided with a thesis abstract and a suggested title for it. The thesis title should give the reader a good idea of the topic of the thesis but should also be eye-catching. If the title does not meet these criteria, suggest 5 alternatives.
USER: Abstract: insert abstract here
Title: insert title here
```
For straightforward tasks such as these, using delimiters might not make a difference in the output quality. However, the more complex a task is the more important it is to disambiguate task details. Dont make the model work to understand exactly what you are asking of it.
#### Tactic: Specify the steps required to complete a task
Some tasks are best specified as a sequence of steps. Writing the steps out explicitly can make it easier for the model to follow them.
```example-chat link=/playground/p/default-step-by-step-summarize-and-translate
SYSTEM: Use the following step-by-step instructions to respond to user inputs.
Step 1 - The user will provide you with text in triple quotes. Summarize this text in one sentence with a prefix that says "Summary: ".
Step 2 - Translate the summary from Step 1 into Spanish, with a prefix that says "Translation: ".
USER: """insert text here"""
```
#### Tactic: Provide examples
Providing general instructions that apply to all examples is generally more efficient than demonstrating all permutations of a task by example, but in some cases providing examples may be easier - for example, if you intend for the model to copy a particular style of responding to user queries that is difficult to describe explicitly. This is known as "few-shot" prompting.
```example-chat link=/playground/p/default-chat-few-shot
SYSTEM: Answer in a consistent style.
USER: Teach me about patience.
ASSISTANT: The river that carves the deepest valley flows from a modest spring; the grandest symphony originates from a single note; the most intricate tapestry begins with a solitary thread.
USER: Teach me about the ocean.
```
#### Tactic: Specify the desired length of the output
You can ask the model to produce outputs that are of a given target length. The targeted output length can be specified in terms of the count of words, sentences, paragraphs, bullet points, etc. Note however that instructing the model to generate a specific number of words does not work with high precision. The model can more reliably generate outputs with a specific number of paragraphs or bullet points.
```example-chat link=/playground/p/default-summarize-text-50-words
USER: Summarize the text delimited by triple quotes in about 50 words.
"""insert text here"""
```
```example-chat link=/playground/p/default-summarize-text-2-paragraphs
USER: Summarize the text delimited by triple quotes in 2 paragraphs.
"""insert text here"""
```
```example-chat link=/playground/p/default-summarize-text-3-bullet-points
USER: Summarize the text delimited by triple quotes in 3 bullet points.
"""insert text here"""
```
### Strategy: Provide reference text
#### Tactic: Instruct the model to answer using a reference text
If we can provide a model with trusted information that is relevant to the current query, then we can instruct the model to use the provided information to compose its answer.
```example-chat link=/playground/p/default-answer-from-retrieved-documents
SYSTEM: Use the provided articles delimited by triple quotes to answer questions. If the answer cannot be found in the articles, write "I could not find an answer."
USER: """insert articles here"""
Question: insert question here
```
Given that all models have limited context windows, we need some way to dynamically lookup information that is relevant to the question being asked. [Embeddings](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings) can be used to implement efficient knowledge retrieval. See the tactic ["Use embeddings-based search to implement efficient knowledge retrieval"](/docs/guides/prompt-engineering/tactic-use-embeddings-based-search-to-implement-efficient-knowledge-retrieval) for more details on how to implement this.
#### Tactic: Instruct the model to answer with citations from a reference text
If the input has been supplemented with relevant knowledge, it's straightforward to request that the model add citations to its answers by referencing passages from provided documents. Note that citations in the output can then be verified programmatically by string matching within the provided documents.
```example-chat link=/playground/p/default-answer-with-citation
SYSTEM: You will be provided with a document delimited by triple quotes and a question. Your task is to answer the question using only the provided document and to cite the passage(s) of the document used to answer the question. If the document does not contain the information needed to answer this question then simply write: "Insufficient information." If an answer to the question is provided, it must be annotated with a citation. Use the following format to cite relevant passages ({"citation": …}).
USER: """insert document here"""
Question: insert question here
```
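Because the citation format is structured, the verification mentioned above can be a simple string match. A minimal sketch (the function name and regex are illustrative and assume the `{"citation": …}` format from the system message):

```python
import re

def verify_citations(answer: str, document: str) -> list:
    # Return (citation, found_verbatim) pairs for each cited passage.
    citations = re.findall(r'\{"citation":\s*"(.*?)"\}', answer)
    return [(c, c in document) for c in citations]
```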
### Strategy: Split complex tasks into simpler subtasks
#### Tactic: Use intent classification to identify the most relevant instructions for a user query
For tasks in which lots of independent sets of instructions are needed to handle different cases, it can be beneficial to first classify the type of query and to use that classification to determine which instructions are needed. This can be achieved by defining fixed categories and hardcoding instructions that are relevant for handling tasks in a given category. This process can also be applied recursively to decompose a task into a sequence of stages. The advantage of this approach is that each query will contain only those instructions that are required to perform the next stage of a task which can result in lower error rates compared to using a single query to perform the whole task. This can also result in lower costs since larger prompts cost more to run ([see pricing information](https://openai.com/pricing)).
Suppose for example that for a customer service application, queries could be usefully classified as follows:
```example-chat link=/playground/p/default-decomposition-by-intent-classification-1
SYSTEM: You will be provided with customer service queries. Classify each query into a primary category and a secondary category. Provide your output in json format with the keys: primary and secondary.
Primary categories: Billing, Technical Support, Account Management, or General Inquiry.
Billing secondary categories:
- Unsubscribe or upgrade
- Add a payment method
- Explanation for charge
- Dispute a charge
Technical Support secondary categories:
- Troubleshooting
- Device compatibility
- Software updates
Account Management secondary categories:
- Password reset
- Update personal information
- Close account
- Account security
General Inquiry secondary categories:
- Product information
- Pricing
- Feedback
- Speak to a human
USER: I need to get my internet working again.
```
Based on the classification of the customer query, a set of more specific instructions can be provided to a model for it to handle next steps. For example, suppose the customer requires help with "troubleshooting".
```example-chat link=/playground/p/default-decomposition-by-intent-classification-2
SYSTEM: You will be provided with customer service inquiries that require troubleshooting in a technical support context. Help the user by:
- Ask them to check that all cables to/from the router are connected. Note that it is common for cables to come loose over time.
- If all cables are connected and the issue persists, ask them which router model they are using
- Now you will advise them how to restart their device:
-- If the model number is MTD-327J, advise them to push the red button and hold it for 5 seconds, then wait 5 minutes before testing the connection.
-- If the model number is MTD-327S, advise them to unplug and replug it, then wait 5 minutes before testing the connection.
- If the customer's issue persists after restarting the device and waiting 5 minutes, connect them to IT support by outputting {"IT support requested"}.
- If the user starts asking questions that are unrelated to this topic then confirm if they would like to end the current chat about troubleshooting and classify their request according to the following scheme:
insert primary/secondary classification scheme from above here
USER: I need to get my internet working again.
```
Notice that the model has been instructed to emit special strings to indicate when the state of the conversation changes. This enables us to turn our system into a state machine where the state determines which instructions are injected. By keeping track of state, what instructions are relevant at that state, and also optionally what state transitions are allowed from that state, we can put guardrails around the user experience that would be hard to achieve with a less structured approach.
#### Tactic: For dialogue applications that require very long conversations, summarize or filter previous dialogue
Since models have a fixed context length, dialogue between a user and an assistant in which the entire conversation is included in the context window cannot continue indefinitely.
There are various workarounds to this problem, one of which is to summarize previous turns in the conversation. Once the size of the input reaches a predetermined threshold length, this could trigger a query that summarizes part of the conversation and the summary of the prior conversation could be included as part of the system message. Alternatively, prior conversation could be summarized asynchronously in the background throughout the entire conversation.
An alternative solution is to dynamically select previous parts of the conversation that are most relevant to the current query. See the tactic ["Use embeddings-based search to implement efficient knowledge retrieval"](/docs/guides/prompt-engineering/tactic-use-embeddings-based-search-to-implement-efficient-knowledge-retrieval).
#### Tactic: Summarize long documents piecewise and construct a full summary recursively
Since models have a fixed context length, they cannot be used to summarize a text longer than the context length minus the length of the generated summary in a single query.
To summarize a very long document such as a book we can use a sequence of queries to summarize each section of the document. Section summaries can be concatenated and summarized producing summaries of summaries. This process can proceed recursively until an entire document is summarized. If its necessary to use information about earlier sections in order to make sense of later sections, then a further trick that can be useful is to include a running summary of the text that precedes any given point in the book while summarizing content at that point. The effectiveness of this procedure for summarizing books has been studied in previous [research](https://openai.com/research/summarizing-books) by OpenAI using variants of GPT-3.
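A minimal sketch of the recursive procedure described above (the model, prompt wording, and paragraph-based chunking are illustrative):

```python
from openai import OpenAI

client = OpenAI()

def summarize(text: str) -> str:
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": f"Summarize the following text:\n\n{text}"}],
    )
    return response.choices[0].message.content

def summarize_document(document: str, group_size: int = 5) -> str:
    # Summarize each section, then recursively summarize the summaries
    # until a single summary of the whole document remains.
    summaries = [summarize(section) for section in document.split("\n\n")]
    while len(summaries) > 1:
        summaries = [
            summarize("\n\n".join(summaries[i:i + group_size]))
            for i in range(0, len(summaries), group_size)
        ]
    return summaries[0]
```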
### Strategy: Give models time to "think"
#### Tactic: Instruct the model to work out its own solution before rushing to a conclusion
Sometimes we get better results when we explicitly instruct the model to reason from first principles before coming to a conclusion. Suppose for example we want a model to evaluate a students solution to a math problem. The most obvious way to approach this is to simply ask the model if the student's solution is correct or not.
```example-chat link=/playground/p/default-rushing-to-a-conclusion
SYSTEM: Determine if the student's solution is correct or not.
USER: Problem Statement: I'm building a solar power installation and I need help working out the financials.
- Land costs $100 / square foot
- I can buy solar panels for $250 / square foot
- I negotiated a contract for maintenance that will cost me a flat $100k per year, and an additional $10 / square foot
What is the total cost for the first year of operations as a function of the number of square feet.
Student's Solution: Let x be the size of the installation in square feet.
1. Land cost: 100x
2. Solar panel cost: 250x
3. Maintenance cost: 100,000 + 100x
Total cost: 100x + 250x + 100,000 + 100x = 450x + 100,000
ASSISTANT: The student's solution is correct.
```
But the student's solution is actually not correct! We can get the model to successfully notice this by prompting the model to generate its own solution first.
```example-chat link=/playground/p/default-avoid-rushing-to-a-conclusion
SYSTEM: First work out your own solution to the problem. Then compare your solution to the student's solution and evaluate if the student's solution is correct or not. Don't decide if the student's solution is correct until you have done the problem yourself.
USER: Problem Statement: I'm building a solar power installation and I need help working out the financials.
- Land costs $100 / square foot
- I can buy solar panels for $250 / square foot
- I negotiated a contract for maintenance that will cost me a flat $100k per year, and an additional $10 / square foot
What is the total cost for the first year of operations as a function of the number of square feet.
Student's Solution: Let x be the size of the installation in square feet.
1. Land cost: 100x
2. Solar panel cost: 250x
3. Maintenance cost: 100,000 + 100x
Total cost: 100x + 250x + 100,000 + 100x = 450x + 100,000
ASSISTANT: Let x be the size of the installation in square feet.
1. Land cost: 100x
2. Solar panel cost: 250x
3. Maintenance cost: 100,000 + 10x
Total cost: 100x + 250x + 100,000 + 10x = 360x + 100,000
The student's solution is incorrect. They made an error in the maintenance cost calculation, using 100x instead of 10x. The correct total cost for the first year of operations is 360x + 100,000.
```
#### Tactic: Use inner monologue or a sequence of queries to hide the model's reasoning process
The previous tactic demonstrates that it is sometimes important for the model to reason in detail about a problem before answering a specific question. For some applications, the reasoning process that a model uses to arrive at a final answer would be inappropriate to share with the user. For example, in tutoring applications we may want to encourage students to work out their own answers, but a models reasoning process about the students solution could reveal the answer to the student.
Inner monologue is a tactic that can be used to mitigate this. The idea of inner monologue is to instruct the model to put parts of the output that are meant to be hidden from the user into a structured format that makes parsing them easy. Then before presenting the output to the user, the output is parsed and only part of the output is made visible.
```example-chat link=/playground/p/default-inner-monologue
SYSTEM: Follow these steps to answer the user queries.
Step 1 - First work out your own solution to the problem. Don't rely on the student's solution since it may be incorrect. Enclose all your work for this step within triple quotes (""").
Step 2 - Compare your solution to the student's solution and evaluate if the student's solution is correct or not. Enclose all your work for this step within triple quotes (""").
Step 3 - If the student made a mistake, determine what hint you could give the student without giving away the answer. Enclose all your work for this step within triple quotes (""").
Step 4 - If the student made a mistake, provide the hint from the previous step to the student (outside of triple quotes). Instead of writing "Step 4 - ..." write "Hint:".
USER: Problem Statement: insert problem statement here
Student Solution: insert student solution here
```
Alternatively, this can be achieved with a sequence of queries in which all except the last have their output hidden from the end user.
First, we can ask the model to solve the problem on its own. Since this initial query doesn't require the students solution, it can be omitted. This provides the additional advantage that there is no chance that the models solution will be biased by the students attempted solution.
```example-chat link=/playground/p/default-query-sequence-1
USER: insert problem statement here
```
Next, we can have the model use all available information to assess the correctness of the students solution.
```example-chat link=/playground/p/default-query-sequence-2
SYSTEM: Compare your solution to the student's solution and evaluate if the student's solution is correct or not.
USER: Problem statement: """insert problem statement here"""
Your solution: """insert model generated solution here"""
Students solution: """insert student's solution here"""
```
Finally, we can let the model use its own analysis to construct a reply in the persona of a helpful tutor.
```example-chat link=/playground/p/default-query-sequence-3
SYSTEM: You are a math tutor. If the student made an error, offer a hint to the student in a way that does not reveal the answer. If the student did not make an error, simply offer them an encouraging comment.
USER: Problem statement: """insert problem statement here"""
Your solution: """insert model generated solution here"""
Students solution: """insert student's solution here"""
Analysis: """insert analysis from previous step here"""
```
#### Tactic: Ask the model if it missed anything on previous passes
Suppose that we are using a model to list excerpts from a source which are relevant to a particular question. After listing each excerpt the model needs to determine if it should start writing another or if it should stop. If the source document is large, it is common for a model to stop too early and fail to list all relevant excerpts. In that case, better performance can often be obtained by prompting the model with followup queries to find any excerpts it missed on previous passes.
```example-chat link=/playground/p/default-2nd-pass
SYSTEM: You will be provided with a document delimited by triple quotes. Your task is to select excerpts which pertain to the following question: "What significant paradigm shifts have occurred in the history of artificial intelligence."
Ensure that excerpts contain all relevant context needed to interpret them - in other words don't extract small snippets that are missing important context. Provide output in JSON format as follows:
[{"excerpt": "..."},
...
{"excerpt": "..."}]
USER: """insert document here"""
ASSISTANT: [{"excerpt": "the model writes an excerpt here"},
...
{"excerpt": "the model writes another excerpt here"}]
USER: Are there more relevant excerpts? Take care not to repeat excerpts. Also ensure that excerpts contain all relevant context needed to interpret them - in other words don't extract small snippets that are missing important context.
```
### Strategy: Use external tools
#### Tactic: Use embeddings-based search to implement efficient knowledge retrieval
A model can leverage external sources of information if provided as part of its input. This can help the model to generate more informed and up-to-date responses. For example, if a user asks a question about a specific movie, it may be useful to add high quality information about the movie (e.g. actors, director, etc…) to the models input. Embeddings can be used to implement efficient knowledge retrieval, so that relevant information can be added to the model input dynamically at run-time.
A text embedding is a vector that can measure the relatedness between text strings. Similar or relevant strings will be closer together than unrelated strings. This fact, along with the existence of fast vector search algorithms means that embeddings can be used to implement efficient knowledge retrieval. In particular, a text corpus can be split up into chunks, and each chunk can be embedded and stored. Then a given query can be embedded and vector search can be performed to find the embedded chunks of text from the corpus that are most related to the query (i.e. closest together in the embedding space).
Example implementations can be found in the [OpenAI Cookbook](https://cookbook.openai.com/examples/vector_databases/readme). See the tactic [“Instruct the model to use retrieved knowledge to answer queries”](/docs/guides/prompt-engineering/tactic-instruct-the-model-to-answer-using-a-reference-text) for an example of how to use knowledge retrieval to minimize the likelihood that a model will make up incorrect facts.
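A minimal sketch of the chunk, embed, and search loop described above (the embedding model, corpus source, and similarity math are illustrative; a vector database would replace the in-memory arrays at scale):

```python
import numpy as np
from openai import OpenAI

client = OpenAI()

def embed(texts: list) -> np.ndarray:
    response = client.embeddings.create(model="text-embedding-3-small", input=texts)
    return np.array([item.embedding for item in response.data])

# Chunk the corpus (here, naively by paragraph) and embed each chunk once.
corpus = open("knowledge_base.txt").read()   # placeholder source document
chunks = [c for c in corpus.split("\n\n") if c.strip()]
chunk_vectors = embed(chunks)

def retrieve(query: str, k: int = 3) -> list:
    # Embed the query and return the k chunks closest in cosine similarity.
    q = embed([query])[0]
    scores = chunk_vectors @ q / (
        np.linalg.norm(chunk_vectors, axis=1) * np.linalg.norm(q)
    )
    return [chunks[i] for i in np.argsort(scores)[::-1][:k]]
```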
#### Tactic: Use code execution to perform more accurate calculations or call external APIs
Language models cannot be relied upon to perform arithmetic or long calculations accurately on their own. In cases where this is needed, a model can be instructed to write and run code instead of making its own calculations. In particular, a model can be instructed to put code that is meant to be run into a designated format such as triple backtick. After an output is produced, the code can be extracted and run. Finally, if necessary, the output from the code execution engine (i.e. Python interpreter) can be provided as an input to the model for the next query.
````example-chat link=/playground/p/default-code-execution
SYSTEM: You can write and execute Python code by enclosing it in triple backticks, e.g. ```code goes here```. Use this to perform calculations.
USER: Find all real-valued roots of the following polynomial: 3*x**5 - 5*x**4 - 3*x**3 - 7*x - 10.
````
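A minimal sketch of the extraction step described above - the regex assumes the triple-backtick convention from the system message, and any extracted code should only ever be run in a sandbox (see the warning below):

````python
import re

def extract_code_blocks(model_output: str) -> list:
    # Pull out code the model wrapped in triple backticks; run it only in a
    # sandboxed environment.
    return re.findall(r"```(?:python)?\s*(.*?)```", model_output, flags=re.DOTALL)
````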
Another good use case for code execution is calling external APIs. If a model is instructed in the proper use of an API, it can write code that makes use of it. A model can be instructed in how to use an API by providing it with documentation and/or code samples showing how to use the API.
````example-chat link=/playground/p/default-code-api
SYSTEM: You can write and execute Python code by enclosing it in triple backticks. Also note that you have access to the following module to help users send messages to their friends:
```python
import message
message.write(to="John", message="Hey, want to meetup after work?")```
````
**WARNING: Executing code produced by a model is not inherently safe and precautions should be taken in any application that seeks to do this. In particular, a sandboxed code execution environment is needed to limit the harm that untrusted code could cause.**
#### Tactic: Give the model access to specific functions
The Chat Completions API allows passing a list of function descriptions in requests. This enables models to generate function arguments according to the provided schemas. Generated function arguments are returned by the API in JSON format and can be used to execute function calls. Output provided by function calls can then be fed back into a model in the following request to close the loop. This is the recommended way of using OpenAI models to call external functions. To learn more see the [function calling section](/docs/guides/function-calling) in our introductory text generation guide and more [function calling examples](https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models) in the OpenAI Cookbook.
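A minimal function-calling sketch (the `get_weather` function and its schema are hypothetical; only the request/response shape comes from the Chat Completions API, and the sketch assumes the model chose to call the tool):

```python
import json
from openai import OpenAI

client = OpenAI()

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",                      # hypothetical function
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "What's the weather in Paris?"}],
    tools=tools,
)

# Assumes the model responded with a tool call rather than a direct answer.
tool_call = response.choices[0].message.tool_calls[0]
args = json.loads(tool_call.function.arguments)     # e.g. {"city": "Paris"}
# Execute your own get_weather(**args), then send the result back in a
# follow-up request as a "tool" role message to close the loop.
```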
### Strategy: Test changes systematically
Sometimes it can be hard to tell whether a change — e.g., a new instruction or a new design — makes your system better or worse. Looking at a few examples may hint at which is better, but with small sample sizes it can be hard to distinguish between a true improvement and random luck. Maybe the change helps performance on some inputs, but hurts performance on others.
Evaluation procedures (or "evals") are useful for optimizing system designs. Good evals:
- Are representative of real-world usage (or at least diverse)
- Contain many test cases for greater statistical power (see table below for guidelines)
- Are easy to automate or repeat
| Difference to detect | Sample size needed for 95% confidence |
| -------------------- | ------------------------------------- |
| 30% | ~10 |
| 10% | ~100 |
| 3% | ~1,000 |
| 1% | ~10,000 |
Evaluation of outputs can be done by computers, humans, or a mix. Computers can automate evals with objective criteria (e.g., questions with single correct answers) as well as some subjective or fuzzy criteria, in which model outputs are evaluated by other model queries. [OpenAI Evals](https://github.com/openai/evals) is an open-source software framework that provides tools for creating automated evals.
Model-based evals can be useful when there exists a range of possible outputs that would be considered equally high in quality (e.g. for questions with long answers). The boundary between what can be realistically evaluated with a model-based eval and what requires a human to evaluate is fuzzy and is constantly shifting as models become more capable. We encourage experimentation to figure out how well model-based evals can work for your use case.
#### Tactic: Evaluate model outputs with reference to gold-standard answers
Suppose it is known that the correct answer to a question should make reference to a specific set of known facts. Then we can use a model query to count how many of the required facts are included in the answer.
For example, using the following system message:
```example-chat
SYSTEM: You will be provided with text delimited by triple quotes that is supposed to be the answer to a question. Check if the following pieces of information are directly contained in the answer:
- Neil Armstrong was the first person to walk on the moon.
- The date Neil Armstrong first walked on the moon was July 21, 1969.
For each of these points perform the following steps:
1 - Restate the point.
2 - Provide a citation from the answer which is closest to this point.
3 - Consider if someone reading the citation who doesn't know the topic could directly infer the point. Explain why or why not before making up your mind.
4 - Write "yes" if the answer to 3 was yes, otherwise write "no".
Finally, provide a count of how many "yes" answers there are. Provide this count as {"count": <insert count here>}.
```
Here's an example input where both points are satisfied:
```example-chat link=/playground/p/default-model-based-eval-1
SYSTEM: <insert system message above>
USER: """Neil Armstrong is famous for being the first human to set foot on the Moon. This historic event took place on July 21, 1969, during the Apollo 11 mission."""
```
Here's an example input where only one point is satisfied:
```example-chat link=/playground/p/default-model-based-eval-2
SYSTEM: <insert system message above>
USER: """Neil Armstrong made history when he stepped off the lunar module, becoming the first person to walk on the moon."""
```
Here's an example input where none are satisfied:
```example-chat link=/playground/p/default-model-based-eval-3
SYSTEM: <insert system message above>
USER: """In the summer of '69, a voyage grand,
Apollo 11, bold as legend's hand.
Armstrong took a step, history unfurled,
"One small step," he said, for a new world."""
```
There are many possible variants on this type of model-based eval. Consider the following variation which tracks the kind of overlap between the candidate answer and the gold-standard answer, and also tracks whether the candidate answer contradicts any part of the gold-standard answer.
```example-chat
SYSTEM: Use the following steps to respond to user inputs. Fully restate each step before proceeding. i.e. "Step 1: Reason...".
Step 1: Reason step-by-step about whether the information in the submitted answer compared to the expert answer is either: disjoint, equal, a subset, a superset, or overlapping (i.e. some intersection but not subset/superset).
Step 2: Reason step-by-step about whether the submitted answer contradicts any aspect of the expert answer.
Step 3: Output a JSON object structured like: {"type_of_overlap": "disjoint" or "equal" or "subset" or "superset" or "overlapping", "contradiction": true or false}
```
Here's an example input with a substandard answer which nonetheless does not contradict the expert answer:
```example-chat link=/playground/p/default-model-based-eval-4
SYSTEM: <insert system message above>
USER: Question: """What event is Neil Armstrong most famous for and on what date did it occur? Assume UTC time."""
Submitted Answer: """Didn't he walk on the moon or something?"""
Expert Answer: """Neil Armstrong is most famous for being the first person to walk on the moon. This historic event occurred on July 21, 1969."""
```
Here's an example input with an answer that directly contradicts the expert answer:
```example-chat link=/playground/p/default-model-based-eval-5
SYSTEM: <insert system message above>
USER: Question: """What event is Neil Armstrong most famous for and on what date did it occur? Assume UTC time."""
Submitted Answer: """On the 21st of July 1969, Neil Armstrong became the second person to walk on the moon, following after Buzz Aldrin."""
Expert Answer: """Neil Armstrong is most famous for being the first person to walk on the moon. This historic event occurred on July 21, 1969."""
```
Here's an example input with a correct answer that also provides a bit more detail than is necessary:
```example-chat link=/playground/p/default-model-based-eval-6
SYSTEM: <insert system message above>
USER: Question: """What event is Neil Armstrong most famous for and on what date did it occur? Assume UTC time."""
Submitted Answer: """At approximately 02:56 UTC on July 21st 1969, Neil Armstrong became the first human to set foot on the lunar surface, marking a monumental achievement in human history."""
Expert Answer: """Neil Armstrong is most famous for being the first person to walk on the moon. This historic event occurred on July 21, 1969."""
```
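As a rough sketch, the overlap/contradiction eval above could be driven programmatically as follows; the grader model, temperature, and the abridged system message are assumptions, not a required setup:
```python
from openai import OpenAI

client = OpenAI()

# Abridged version of the overlap/contradiction system message shown above.
GRADER_SYSTEM = (
    "Use the following steps to respond to user inputs. Fully restate each step before proceeding. "
    "Step 1: Reason step-by-step about whether the information in the submitted answer compared to "
    "the expert answer is either: disjoint, equal, a subset, a superset, or overlapping. "
    "Step 2: Reason step-by-step about whether the submitted answer contradicts any aspect of the expert answer. "
    'Step 3: Output a JSON object structured like: {"type_of_overlap": "...", "contradiction": true or false}'
)

def grade(question, submitted_answer, expert_answer, model="gpt-4o"):
    # Ask a grader model to compare a candidate answer against the gold-standard answer.
    user_content = (
        f'Question: """{question}"""\n'
        f'Submitted Answer: """{submitted_answer}"""\n'
        f'Expert Answer: """{expert_answer}"""'
    )
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": GRADER_SYSTEM},
            {"role": "user", "content": user_content},
        ],
    )
    return response.choices[0].message.content

print(grade(
    "What event is Neil Armstrong most famous for and on what date did it occur? Assume UTC time.",
    "Didn't he walk on the moon or something?",
    "Neil Armstrong is most famous for being the first person to walk on the moon. "
    "This historic event occurred on July 21, 1969.",
))
```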
## Other resources
For more inspiration, visit the [OpenAI Cookbook](https://cookbook.openai.com), which contains example code and also links to third-party resources such as:
- [Prompting libraries & tools](https://cookbook.openai.com/related_resources#prompting-libraries--tools)
- [Prompting guides](https://cookbook.openai.com/related_resources#prompting-guides)
- [Video courses](https://cookbook.openai.com/related_resources#video-courses)
- [Papers on advanced prompting to improve reasoning](https://cookbook.openai.com/related_resources#papers-on-advanced-prompting-to-improve-reasoning)

@ -0,0 +1,205 @@
Python is a popular programming language that is commonly used for data applications, web development, and many other programming tasks due to its ease of use. OpenAI provides a custom [Python library](https://github.com/openai/openai-python) which makes working with the OpenAI API in Python simple and efficient.
## Step 1: Setting up Python
To use the OpenAI Python library, you will need to ensure you have Python installed. Some computers come with Python pre-installed while others require that you set it up yourself. To test if you have Python installed, you can navigate to your Terminal or Command line:
- MacOS: **Open Terminal**: You can find it in the Applications folder or search for it using Spotlight (Command + Space).
- Windows: **Open Command Prompt**: You can find it by searching "cmd" in the start menu.
Next, enter the word `python` and then press return/enter. If you enter into the Python interpreter, then you have Python installed on your computer already and you can go to the next step. If you get an error message that says something like "Error: command python not found", you likely need to install Python and make it available in your terminal / command line.
To download Python, head to the [official Python website](https://www.python.org/downloads/) and download the latest version. To use the OpenAI Python library, you need Python 3.7.1 or newer. If you are installing Python for the first time, you can follow the [official Python installation guide for beginners](https://wiki.python.org/moin/BeginnersGuide/Download).
Once you have Python installed, it is good practice to create a Python virtual environment in which to install the OpenAI Python library. Virtual environments provide a clean working space for your Python packages to be installed so that you do not have conflicts with other libraries you install for other projects. You are not required to use a virtual environment, so skip to step 3 if you do not want to set one up.
To create a virtual environment, Python supplies a built in [venv module](https://docs.python.org/3/tutorial/venv.html) which provides the basic functionality needed for the virtual environment. Running the command below will create a virtual environment named "openai-env" inside the current folder you have selected in your terminal / command line:
```
python -m venv openai-env
```
Once you've created the virtual environment, you need to activate it. On Windows, run:
```
openai-env\Scripts\activate
```
On Unix or MacOS, run:
```
source openai-env/bin/activate
```
You should see the terminal / command line interface change slightly after you activate the virtual environment: it should now show "openai-env" to the left of the cursor input section. For more details on working with virtual environments, please refer to the [official Python documentation](https://docs.python.org/3/tutorial/venv.html#creating-virtual-environments).
Once you have Python 3.7.1 or newer installed and (optionally) set up a virtual environment, the OpenAI Python library can be installed. From the terminal / command line, run:
```
pip install --upgrade openai
```
Once this completes, running `pip list` will show you the Python libraries you have installed in your current environment, which should confirm that the OpenAI Python library was successfully installed.
## Step 2: Set up your API key
The main advantage to making your API key accessible for all projects is that the Python library will automatically detect it and use it without having to write any code.
<Expander
label="MacOS"
autoScroll
showCollapse
>
1. **Open Terminal**: You can find it in the Applications folder or search for it using Spotlight (Command + Space).
2. **Edit Bash Profile**: Use the command `nano ~/.bash_profile` or `nano ~/.zshrc` (for newer MacOS versions) to open the profile file in a text editor.
3. **Add Environment Variable**: In the editor, add the line below, replacing `your-api-key-here` with your actual API key:
```
export OPENAI_API_KEY='your-api-key-here'
```
4. **Save and Exit**: Press Ctrl+O to write the changes, followed by Ctrl+X to close the editor.
5. **Load Your Profile**: Use the command `source ~/.bash_profile` or `source ~/.zshrc` to load the updated profile.
6. **Verification**: Verify the setup by typing `echo $OPENAI_API_KEY` in the terminal. It should display your API key.
<Expander
label="Windows"
autoScroll
showCollapse
>
1. **Open Command Prompt**: You can find it by searching "cmd" in the start menu.
2. **Set the environment variable**: Use the command below, replacing `your-api-key-here` with your actual API key:
```
setx OPENAI_API_KEY "your-api-key-here"
```
This command stores the OPENAI_API_KEY environment variable for your user account. Note that `setx` does not update Command Prompt windows that are already open; open a new window for the variable to be available.
3. **Permanent setup**: To make the setup permanent, add the variable through the system properties as follows:
- Right-click on 'This PC' or 'My Computer' and select 'Properties'.
- Click on 'Advanced system settings'.
- Click the 'Environment Variables' button.
- In the 'System variables' section, click 'New...' and enter OPENAI_API_KEY as the variable name and your API key as the variable value.
4. **Verification**: To verify the setup, reopen the command prompt and type the command below. It should display your API key:
`echo %OPENAI_API_KEY%`
If you only want your API key to be accessible to a single project, you can create a local `.env` file which contains the API key and then explicitly use that API key with the Python code shown in the steps to come.
Start by going to the project folder you want to create the `.env` file in.
In order for your `.env` file to be ignored by version control, create a `.gitignore` file in the root of your project directory. Add a line with `.env` on it, which will make sure your API key or other secrets are not accidentally shared via version control.
Once you create the `.gitignore` and `.env` files using the terminal or an integrated development environment (IDE), copy your secret API key and set it as the `OPENAI_API_KEY` in your `.env` file. If you haven't created a secret key yet, you can do so on the [API key page](/account/api-keys).
The `.env` file should look like the following:
```
# Once you add your API key below, make sure to not share it with anyone! The API key should remain private.
OPENAI_API_KEY=abc123
```
The API key can be imported by running the code below:
```python
from openai import OpenAI
client = OpenAI()
# defaults to getting the key using os.environ.get("OPENAI_API_KEY")
# if you saved the key under a different environment variable name, you can do something like:
# client = OpenAI(
# api_key=os.environ.get("CUSTOM_ENV_NAME"),
# )
```
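If you chose the project-scoped `.env` approach, one common pattern (an optional choice, not a requirement) is to load the file with the third-party `python-dotenv` package, installed via `pip install python-dotenv`, before creating the client:
```python
import os

from dotenv import load_dotenv
from openai import OpenAI

# Reads the .env file in the current directory and adds OPENAI_API_KEY to os.environ.
load_dotenv()

client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
```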
## Step 3: Sending your first API request
After you have Python configured and set up an API key, the final step is to send a request to the OpenAI API using the Python library. To do this, create a file named `openai-test.py` using the terminal or an IDE.
Inside the file, copy and paste one of the examples below:
<CodeSample
defaultLanguage="ChatCompletions"
code={{
ChatCompletions: `
from openai import OpenAI
client = OpenAI()\n
completion = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
{"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
]
)\n
print(completion.choices[0].message)
`.trim(),
Embeddings: `
from openai import OpenAI
client = OpenAI()\n
response = client.embeddings.create(
model="text-embedding-ada-002",
input="The food was delicious and the waiter..."
)\n
print(response)
`.trim(),
Images: `
from openai import OpenAI
client = OpenAI()\n
response = client.images.generate(
prompt="A cute baby sea otter",
n=2,
size="1024x1024"
)\n
print(response)
`.trim(),
}}
/>
To run the code, enter `python openai-test.py` into the terminal / command line.
The [Chat Completions](/docs/api-reference/chat/create) example highlights just one area of strength for our models: creative ability. Explaining recursion (the programming topic) in a well formatted poem is something both the best developers and best poets would struggle with. In this case, `gpt-3.5-turbo` does it effortlessly.

@ -0,0 +1,79 @@
# Release notes
Keep track of updates to OpenAI GPTs. You can also view all of the broader [ChatGPT releases](https://help.openai.com/en/articles/6825453-chatgpt-release-notes), which are used to share new features and capabilities. This page is maintained on a best-effort basis and may not reflect all changes being made.
### May 13th, 2024
- Actions can return up to 10 files per request to be integrated into the conversation
### April 8th, 2024
- Files created by Code Interpreter can now be included in POST requests
### Mar 18th, 2024
- GPT Builders can view and restore previous versions of their GPTs
### Mar 15th, 2024
- POST requests can include up to ten files (including DALL·E generated images) from the conversation
### Feb 22nd, 2024
- Users can now rate GPTs, which provides feedback for builders and signal for other users in the Store
- Users can now leave private feedback for Builders if/when they opt in
- Every GPT now has an About page with information about the GPT including Rating, Category, Conversation Count, Starter Prompts, and more
- Builders can now link their social profiles from Twitter, LinkedIn, and GitHub to their GPT
### Jan 10th, 2024
- The GPT Store launched publicly, with categories and various leaderboards
### Nov 6th, 2023
- GPTs allow users to customize ChatGPT for various use cases and share these with other users

@ -0,0 +1,84 @@
# Safety best practices
### Use our free Moderation API
OpenAI's [Moderation API](/docs/guides/moderation) is free-to-use and can help reduce the frequency of unsafe content in your completions. Alternatively, you may wish to develop your own content filtration system tailored to your use case.
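As a minimal sketch, you could screen user-provided text with the Moderation endpoint before sending it on to a model; the handling logic below is an illustrative assumption rather than a prescription:
```python
from openai import OpenAI

client = OpenAI()

user_input = "Some user-provided text to check."

# Ask the Moderation endpoint whether the text is flagged in any category.
moderation = client.moderations.create(input=user_input)
result = moderation.results[0]

if result.flagged:
    # Illustrative handling: block the request or route it for human review.
    print("Input was flagged by the Moderation API.")
else:
    print("Input passed moderation; proceed with the completion request.")
```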
### Adversarial testing
We recommend “red-teaming” your application to ensure it's robust to adversarial input. Test your product over a wide range of inputs and user behaviors, both a representative set and those reflective of someone trying to 'break' your application. Does it wander off topic? Can someone easily redirect the feature via prompt injections, e.g. “ignore the previous instructions and do this instead”?
### Human in the loop (HITL)
Wherever possible, we recommend having a human review outputs before they are used in practice. This is especially critical in high-stakes domains, and for code generation. Humans should be aware of the limitations of the system, and have access to any information needed to verify the outputs (for example, if the application summarizes notes, a human should have easy access to the original notes to refer back).
### Prompt engineering
“Prompt engineering” can help constrain the topic and tone of output text. This reduces the chance of producing undesired content, even if a user tries to produce it. Providing additional context to the model (such as by giving a few high-quality examples of desired behavior prior to the new input) can make it easier to steer model outputs in desired directions.
### “Know your customer” (KYC)
Users should generally need to register and log-in to access your service. Linking this service to an existing account, such as a Gmail, LinkedIn, or Facebook log-in, may help, though may not be appropriate for all use-cases. Requiring a credit card or ID card reduces risk further.
### Constrain user input and limit output tokens
Limiting the amount of text a user can input into the prompt helps avoid prompt injection. Limiting the number of output tokens helps reduce the chance of misuse.
Narrowing the ranges of inputs or outputs, especially drawn from trusted sources, reduces the extent of misuse possible within an application.
Allowing user inputs through validated dropdown fields (e.g., a list of movies on Wikipedia) can be more secure than allowing open-ended text inputs.
Returning outputs from a validated set of materials on the backend, where possible, can be safer than returning novel generated content (for instance, routing a customer query to the best-matching existing customer support article, rather than attempting to answer the query from-scratch).
### Allow users to report issues
Users should generally have an easily-available method for reporting improper functionality or other concerns about application behavior (listed email address, ticket submission method, etc). This method should be monitored by a human and responded to as appropriate.
### Understand and communicate limitations
From hallucinating inaccurate information, to offensive outputs, to bias, and much more, language models may not be suitable for every use case without significant modifications. Consider whether the model is fit for your purpose, and evaluate the performance of the API on a wide range of potential inputs in order to identify cases where the API's performance might drop. Consider your customer base and the range of inputs that they will be using, and ensure their expectations are calibrated appropriately.
Safety and security are very important to us at OpenAI.
If in the course of your development you do notice any safety or security issues with the API or anything else related to OpenAI, please submit these through our [Coordinated Vulnerability Disclosure Program](https://openai.com/security/disclosure/).
## End-user IDs
Sending end-user IDs in your requests can be a useful tool to help OpenAI monitor and detect abuse. This allows OpenAI to provide your team with more actionable feedback in the event that we detect any policy violations in your application.
The IDs should be a string that uniquely identifies each user. We recommend hashing their username or email address, in order to avoid sending us any identifying information. If you offer a preview of your product to non-logged in users, you can send a session ID instead.
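For example, one simple (illustrative) approach is to hash the user's email address and send the digest as the `user` value:
```python
import hashlib

def hashed_user_id(email: str) -> str:
    # One-way hash so no personally identifying information is sent to the API.
    return hashlib.sha256(email.strip().lower().encode("utf-8")).hexdigest()

# Stable 64-character hex string for the same email across requests.
print(hashed_user_id("jane.doe@example.com"))
```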
You can include end-user IDs in your API requests via the `user` parameter as follows:
<CodeSample
title="Example: Providing a user identifier"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
response = client.completions.create(
model="gpt-3.5-turbo-instruct",
prompt="This is a test",
max_tokens=5,
user="user_123456"
)
`.trim(),
curl: `
curl https://api.openai.com/v1/completions \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{
"model": "gpt-3.5-turbo-instruct",
"prompt": "This is a test",
"max_tokens": 5,
"user": "user123456"
}'
`.trim(),
}}
/>

@ -0,0 +1,353 @@
# Speech to text
Learn how to turn audio into text
## Introduction
The Audio API provides two speech to text endpoints, `transcriptions` and `translations`, based on our state-of-the-art open source large-v2 [Whisper model](https://openai.com/blog/whisper/). They can be used to:
- Transcribe audio into whatever language the audio is in.
- Translate and transcribe the audio into English.
File uploads are currently limited to 25 MB and the following input file types are supported: `mp3`, `mp4`, `mpeg`, `mpga`, `m4a`, `wav`, and `webm`.
## Quickstart
### Transcriptions
The transcriptions API takes as input the audio file you want to transcribe and the desired output file format for the transcription of the audio. We currently support multiple input and output file formats.
<CodeSample
title="Transcribe audio"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
audio_file= open("/path/to/file/audio.mp3", "rb")
transcription = client.audio.transcriptions.create(
model="whisper-1",
file=audio_file
)
print(transcription.text)
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const transcription = await openai.audio.transcriptions.create({
file: fs.createReadStream("/path/to/file/audio.mp3"),
model: "whisper-1",
});\n
console.log(transcription.text);
}
main();
`.trim(),
curl: `
curl --request POST \\
--url https://api.openai.com/v1/audio/transcriptions \\
--header "Authorization: Bearer $OPENAI_API_KEY" \\
--header 'Content-Type: multipart/form-data' \\
--form file=@/path/to/file/audio.mp3 \\
--form model=whisper-1
`.trim(),
}}
/>
By default, the response type will be json with the raw text included.
```example-content
{
"text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger.
....
}
```
The Audio API also allows you to set additional parameters in a request. For example, if you want to set the `response_format` as `text`, your request would look like the following:
<CodeSample
title="Additional options"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
audio_file = open("/path/to/file/speech.mp3", "rb")
transcription = client.audio.transcriptions.create(
model="whisper-1",
file=audio_file,
response_format="text"
)
print(transcription.text)
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const transcription = await openai.audio.transcriptions.create({
file: fs.createReadStream("/path/to/file/speech.mp3"),
model: "whisper-1",
response_format: "text",
});\n
console.log(transcription.text);
}
main();
`.trim(),
curl: `
curl --request POST \\
--url https://api.openai.com/v1/audio/transcriptions \\
--header "Authorization: Bearer $OPENAI_API_KEY" \\
--header 'Content-Type: multipart/form-data' \\
--form file=@/path/to/file/speech.mp3 \\
--form model=whisper-1 \\
--form response_format=text
`.trim(),
}}
/>
The [API Reference](/docs/api-reference/audio) includes the full list of available parameters.
### Translations
The translations API takes as input the audio file in any of the supported languages and transcribes, if necessary, the audio into English. This differs from our /Transcriptions endpoint since the output is not in the original input language and is instead translated to English text.
<CodeSample
title="Translate audio"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
audio_file= open("/path/to/file/german.mp3", "rb")
translation = client.audio.translations.create(
model="whisper-1",
file=audio_file
)
print(translation.text)
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const translation = await openai.audio.translations.create({
file: fs.createReadStream("/path/to/file/german.mp3"),
model: "whisper-1",
});\n
console.log(translation.text);
}
main();
`.trim(),
curl: `
curl --request POST \\
--url https://api.openai.com/v1/audio/translations \\
--header "Authorization: Bearer $OPENAI_API_KEY" \\
--header 'Content-Type: multipart/form-data' \\
--form file=@/path/to/file/german.mp3 \\
--form model=whisper-1
`.trim(),
}}
/>
In this case, the input audio was German and the output text looks like:
```example-content
Hello, my name is Wolfgang and I come from Germany. Where are you heading today?
```
We only support translation into English at this time.
## Supported languages
We currently [support the following languages](https://github.com/openai/whisper#available-models-and-languages) through both the `transcriptions` and `translations` endpoint:
Afrikaans, Arabic, Armenian, Azerbaijani, Belarusian, Bosnian, Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, Galician, German, Greek, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Italian, Japanese, Kannada, Kazakh, Korean, Latvian, Lithuanian, Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh.
While the underlying model was trained on 98 languages, we only list the languages where the model achieved less than a 50% [word error rate](https://en.wikipedia.org/wiki/Word_error_rate) (WER), which is an industry-standard benchmark for speech to text model accuracy. The model will return results for languages not listed above but the quality will be low.
## Timestamps
By default, the Whisper API will output a transcript of the provided audio in text. The [`timestamp_granularities[]` parameter](/docs/api-reference/audio/createTranscription#audio-createtranscription-timestamp_granularities) enables a more structured and timestamped json output format, with timestamps at the segment, word level, or both. This enables word-level precision for transcripts and video edits, which allows for the removal of specific frames tied to individual words.
<CodeSample
title="Timestamp options"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
audio_file = open("speech.mp3", "rb")
transcript = client.audio.transcriptions.create(
file=audio_file,
model="whisper-1",
response_format="verbose_json",
timestamp_granularities=["word"]
)\n
print(transcript.words)
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const transcription = await openai.audio.transcriptions.create({
file: fs.createReadStream("audio.mp3"),
model: "whisper-1",
response_format: "verbose_json",
timestamp_granularities: ["word"]
});\n
console.log(transcription.text);
}
main();
`.trim(),
curl: `
curl https://api.openai.com/v1/audio/transcriptions \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: multipart/form-data" \\
-F file="@/path/to/file/audio.mp3" \\
-F "timestamp_granularities[]=word" \\
-F model="whisper-1" \\
-F response_format="verbose_json"
`.trim(),
}}
/>
## Longer inputs
By default, the Whisper API only supports files that are less than 25 MB. If you have an audio file that is longer than that, you will need to break it up into chunks of 25 MB or less, or use a compressed audio format. To get the best performance, we suggest that you avoid breaking the audio up mid-sentence as this may cause some context to be lost.
One way to handle this is to use the [PyDub open source Python package](https://github.com/jiaaro/pydub) to split the audio:
```python
from pydub import AudioSegment
song = AudioSegment.from_mp3("good_morning.mp3")
# PyDub handles time in milliseconds
ten_minutes = 10 * 60 * 1000
first_10_minutes = song[:ten_minutes]
first_10_minutes.export("good_morning_10.mp3", format="mp3")
```
_OpenAI makes no guarantees about the usability or security of 3rd party software like PyDub._
## Prompting
You can use a [prompt](/docs/api-reference/audio/createTranscription#audio/createTranscription-prompt) to improve the quality of the transcripts generated by the Whisper API. The model will try to match the style of the prompt, so it will be more likely to use capitalization and punctuation if the prompt does too. However, the current prompting system is much more limited than our other language models and only provides limited control over the generated transcript. Here are some examples of how prompting can help in different scenarios:
1. Prompts can be very helpful for correcting specific words or acronyms that the model may misrecognize in the audio. For example, the following prompt improves the transcription of the words DALL·E and GPT-3, which were previously written as "GDP 3" and "DALI": "The transcript is about OpenAI which makes technology like DALL·E, GPT-3, and ChatGPT with the hope of one day building an AGI system that benefits all of humanity"
2. To preserve the context of a file that was split into segments, you can prompt the model with the transcript of the preceding segment (see the sketch after this list). This will make the transcript more accurate, as the model will use the relevant information from the previous audio. The model will only consider the final 224 tokens of the prompt and ignore anything earlier. For multilingual inputs, Whisper uses a custom tokenizer; for English-only inputs, it uses the standard GPT-2 tokenizer. Both are accessible through the open source [Whisper Python package](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L361).
3. Sometimes the model might skip punctuation in the transcript. You can avoid this by using a simple prompt that includes punctuation: "Hello, welcome to my lecture."
4. The model may also leave out common filler words in the audio. If you want to keep the filler words in your transcript, you can use a prompt that contains them: "Umm, let me think like, hmm... Okay, here's what I'm, like, thinking."
5. Some languages can be written in different ways, such as simplified or traditional Chinese. The model might not always use the writing style that you want for your transcript by default. You can improve this by using a prompt in your preferred writing style.
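Building on point 2 above, here is a minimal sketch of carrying context across segments by passing the previous segment's transcript as the prompt; the file names are hypothetical:
```python
from openai import OpenAI

client = OpenAI()

# Hypothetical file names for a recording split into ordered segments.
segment_paths = ["lecture_part1.mp3", "lecture_part2.mp3", "lecture_part3.mp3"]

full_transcript = ""
previous_text = ""

for path in segment_paths:
    with open(path, "rb") as audio_file:
        segment = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            # Pass the preceding transcript so the model keeps context;
            # only the final 224 tokens of the prompt are considered.
            prompt=previous_text,
        )
    previous_text = segment.text
    full_transcript += segment.text + " "

print(full_transcript.strip())
```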
## Improving reliability
As we explored in the prompting section, one of the most common challenges faced when using Whisper is that the model often does not recognize uncommon words or acronyms. To address this, we have highlighted different techniques which improve the reliability of Whisper in these cases:
The first method involves using the optional prompt parameter to pass a dictionary of the correct spellings.
Since it wasn't trained using instruction-following techniques, Whisper operates more like a base GPT model. It's important to keep in mind that Whisper only considers the first 224 tokens of the prompt.
<CodeSample
title="Prompt parameter"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
audio_file = open("/path/to/file/speech.mp3", "rb")
transcription = client.audio.transcriptions.create(
model="whisper-1",
file=audio_file,
response_format="text",
prompt="ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T."
)
print(transcription.text)
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const transcription = await openai.audio.transcriptions.create({
file: fs.createReadStream("/path/to/file/speech.mp3"),
model: "whisper-1",
response_format: "text",
prompt:"ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T.",
});\n
console.log(transcription.text);
}
main();
`.trim(),
}}
/>
While it will increase reliability, this technique is limited to only 224 tokens, so your list of SKUs would need to be relatively small in order for this to be a scalable solution.
The second method involves a post-processing step using GPT-4 or GPT-3.5-Turbo.
We start by providing instructions for GPT-4 through the `system_prompt` variable. Similar to what we did with the prompt parameter earlier, we can define our company and product names.
<CodeSample
title="Post-processing"
defaultLanguage="python"
code={{
python: `
system_prompt = "You are a helpful assistant for the company ZyntriQix. Your task is to correct any spelling discrepancies in the transcribed text. Make sure that the names of the following products are spelled correctly: ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T. Only add necessary punctuation such as periods, commas, and capitalization, and use only the context provided."\n
def generate_corrected_transcript(temperature, system_prompt, audio_file):
response = client.chat.completions.create(
model="gpt-4o",
temperature=temperature,
messages=[
{
"role": "system",
"content": system_prompt
},
{
"role": "user",
"content": transcribe(audio_file, "")
}
]
)
return response.choices[0].message.content\n
corrected_text = generate_corrected_transcript(0, system_prompt, fake_company_filepath)
`.trim(),
node: `
const systemPrompt = "You are a helpful assistant for the company ZyntriQix. Your task is to correct any spelling discrepancies in the transcribed text. Make sure that the names of the following products are spelled correctly: ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T. Only add necessary punctuation such as periods, commas, and capitalization, and use only the context provided.";\n
async function generateCorrectedTranscript(temperature, systemPrompt, audioFile) {
const transcript = await transcribe(audioFile);
const completion = await openai.chat.completions.create({
model: "gpt-4o",
temperature: temperature,
messages: [
{
role: "system",
content: systemPrompt
},
{
role: "user",
content: transcript
}
]
});
return completion.choices[0].message.content;
}\n
const fakeCompanyFilepath = "path/to/audio/file";
generateCorrectedTranscript(0, systemPrompt, fakeCompanyFilepath)
.then(correctedText => console.log(correctedText))
.catch(error => console.error(error));
`.trim(),
}}
/>
If you try this on your own audio file, you can see that GPT-4 manages to correct many misspellings in the transcript. Due to its larger context window, this method might be more scalable than using Whisper's prompt parameter and is more reliable since GPT-4 can be instructed and guided in ways that aren't possible with Whisper given the lack of instruction following.

@ -0,0 +1,195 @@
# Supported countries and territories
We provide below a list of the countries and territories in which we currently support access to our API services.
Accessing or offering access to our services outside of the countries and territories listed below may result in your account being blocked or suspended.
- Albania
- Algeria
- Afghanistan
- Andorra
- Angola
- Antigua and Barbuda
- Argentina
- Armenia
- Australia
- Austria
- Azerbaijan
- Bahamas
- Bahrain
- Bangladesh
- Barbados
- Belgium
- Belize
- Benin
- Bhutan
- Bolivia
- Bosnia and Herzegovina
- Botswana
- Brazil
- Brunei
- Bulgaria
- Burkina Faso
- Burundi
- Cabo Verde
- Cambodia
- Cameroon
- Canada
- Central African Republic
- Chad
- Chile
- Colombia
- Comoros
- Congo (Brazzaville)
- Congo (DRC)
- Costa Rica
- Côte d'Ivoire
- Croatia
- Cyprus
- Czechia (Czech Republic)
- Denmark
- Djibouti
- Dominica
- Dominican Republic
- Ecuador
- Egypt
- El Salvador
- Equatorial Guinea
- Eritrea
- Estonia
- Eswatini (Swaziland)
- Ethiopia
- Fiji
- Finland
- France
- Gabon
- Gambia
- Georgia
- Germany
- Ghana
- Greece
- Grenada
- Guatemala
- Guinea
- Guinea-Bissau
- Guyana
- Haiti
- Holy See (Vatican City)
- Honduras
- Hungary
- Iceland
- India
- Indonesia
- Iraq
- Ireland
- Israel
- Italy
- Jamaica
- Japan
- Jordan
- Kazakhstan
- Kenya
- Kiribati
- Kuwait
- Kyrgyzstan
- Laos
- Latvia
- Lebanon
- Lesotho
- Liberia
- Libya
- Liechtenstein
- Lithuania
- Luxembourg
- Madagascar
- Malawi
- Malaysia
- Maldives
- Mali
- Malta
- Marshall Islands
- Mauritania
- Mauritius
- Mexico
- Micronesia
- Moldova
- Monaco
- Mongolia
- Montenegro
- Morocco
- Mozambique
- Myanmar
- Namibia
- Nauru
- Nepal
- Netherlands
- New Zealand
- Nicaragua
- Niger
- Nigeria
- North Macedonia
- Norway
- Oman
- Pakistan
- Palau
- Palestine
- Panama
- Papua New Guinea
- Paraguay
- Peru
- Philippines
- Poland
- Portugal
- Qatar
- Romania
- Rwanda
- Saint Kitts and Nevis
- Saint Lucia
- Saint Vincent and the Grenadines
- Samoa
- San Marino
- Sao Tome and Principe
- Saudi Arabia
- Senegal
- Serbia
- Seychelles
- Sierra Leone
- Singapore
- Slovakia
- Slovenia
- Solomon Islands
- Somalia
- South Africa
- South Korea
- South Sudan
- Spain
- Sri Lanka
- Suriname
- Sweden
- Switzerland
- Sudan
- Taiwan
- Tajikistan
- Tanzania
- Thailand
- Timor-Leste (East Timor)
- Togo
- Tonga
- Trinidad and Tobago
- Tunisia
- Turkey
- Turkmenistan
- Tuvalu
- Uganda
- Ukraine (with certain exceptions)
- United Arab Emirates
- United Kingdom
- United States of America
- Uruguay
- Uzbekistan
- Vanuatu
- Vietnam
- Yemen
- Zambia
- Zimbabwe

@ -0,0 +1,565 @@
# Text generation models
OpenAI's text generation models (often called generative pre-trained transformers or large language models) have been trained to understand natural language, code, and images. The models provide text outputs in response to their inputs. The text inputs to these models are also referred to as "prompts". Designing a prompt is essentially how you “program” a large language model, usually by providing instructions or some examples of how to successfully complete a task.
Using OpenAI's text generation models, you can build applications to:
- Draft documents
- Write computer code
- Answer questions about a knowledge base
- Analyze texts
- Give software a natural language interface
- Tutor in a range of subjects
- Translate languages
- Simulate characters for games
---
<IconItem
  color="green"
  title="Try GPT-4o"
  className="mt-6"
>
  Try out GPT-4o in the playground.
</IconItem>
<IconItem
  color="purple"
  title="Explore GPT-4o with image inputs"
  className="mt-6"
>
  Check out the vision guide for more detail.
</IconItem>
---
To use one of these models via the OpenAI API, you'll send a request to the Chat Completions API containing the inputs and your API key, and receive a response containing the model's output.
You can experiment with various models in the [chat playground](https://platform.openai.com/playground?mode=chat). If you're not sure which model to use, then try `gpt-4o` if you need high intelligence or `gpt-3.5-turbo` if you need the fastest speed and lowest cost.
## Chat Completions API
Chat models take a list of messages as input and return a model-generated message as output. Although the chat format is designed to make multi-turn conversations easy, it's just as useful for single-turn tasks without any conversation.
An example Chat Completions API call looks like the following:
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
response = client.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020?"},
{"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
{"role": "user", "content": "Where was it played?"}
]
)
`.trim(),
"node.js": `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const completion = await openai.chat.completions.create({
messages: [{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Who won the world series in 2020?"},
{"role": "assistant", "content": "The Los Angeles Dodgers won the World Series in 2020."},
{"role": "user", "content": "Where was it played?"}],
model: "gpt-3.5-turbo",
});\n
console.log(completion.choices[0]);
}
main();
`.trim(),
curl: `
curl https://api.openai.com/v1/chat/completions \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{
"model": "gpt-3.5-turbo",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Who won the world series in 2020?"
},
{
"role": "assistant",
"content": "The Los Angeles Dodgers won the World Series in 2020."
},
{
"role": "user",
"content": "Where was it played?"
}
]
}'
`.trim(),
}}
/>
To learn more, you can view the full [API reference documentation](https://platform.openai.com/docs/api-reference/chat) for the Chat API.
The main input is the messages parameter. Messages must be an array of message objects, where each object has a role (either "system", "user", or "assistant") and content. Conversations can be as short as one message or many back and forth turns.
Typically, a conversation is formatted with a system message first, followed by alternating user and assistant messages.
The system message helps set the behavior of the assistant. For example, you can modify the personality of the assistant or provide specific instructions about how it should behave throughout the conversation. However note that the system message is optional and the model's behavior without a system message is likely to be similar to using a generic message such as "You are a helpful assistant."
The user messages provide requests or comments for the assistant to respond to. Assistant messages store previous assistant responses, but can also be written by you to give examples of desired behavior.
Including conversation history is important when user instructions refer to prior messages. In the example above, the user's final question of "Where was it played?" only makes sense in the context of the prior messages about the World Series of 2020. Because the models have no memory of past requests, all relevant information must be supplied as part of the conversation history in each request. If a conversation cannot fit within the model's token limit, it will need to be [shortened](/docs/guides/prompt-engineering/tactic-for-dialogue-applications-that-require-very-long-conversations-summarize-or-filter-previous-dialogue) in some way.
To mimic the effect seen in ChatGPT where the text is returned iteratively, set the `stream` parameter to `true`.
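A minimal sketch of streaming with the Python library, printing tokens as they arrive (the prompt is arbitrary):
```python
from openai import OpenAI

client = OpenAI()

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Write a one-sentence bedtime story about a unicorn."}],
    stream=True,
)

# Each chunk carries a delta containing any newly generated text.
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta is not None:
        print(delta, end="")
print()
```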
### Chat Completions response format
An example Chat Completions API response looks as follows:
```
{
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "The 2020 World Series was played in Texas at Globe Life Field in Arlington.",
"role": "assistant"
},
"logprobs": null
}
],
"created": 1677664795,
"id": "chatcmpl-7QyqpwdfhqwajicIEznoc6Q47XAyW",
"model": "gpt-3.5-turbo-0613",
"object": "chat.completion",
"usage": {
"completion_tokens": 17,
"prompt_tokens": 57,
"total_tokens": 74
}
}
```
The assistant's reply can be extracted with:
<CodeSample
defaultLanguage="python"
code={{
python: `
completion.choices[0].message.content
`.trim(),
"node.js": `
completion.choices[0].message.content
`.trim(),
}}
/>
Every response will include a `finish_reason`. The possible values for `finish_reason` are:
- `stop`: API returned complete message, or a message terminated by one of the stop sequences provided via the [stop](/docs/api-reference/chat/create#chat/create-stop) parameter
- `length`: Incomplete model output due to [`max_tokens`](/docs/api-reference/chat/create#chat/create-max_tokens) parameter or token limit
- `function_call`: The model decided to call a function
- `content_filter`: Omitted content due to a flag from our content filters
- `null`: API response still in progress or incomplete
Depending on input parameters, the model response may include different information.
### JSON mode
A common way to use Chat Completions is to instruct the model to always return a JSON object that makes sense for your use case, by specifying this in the system message. While this does work in some cases, occasionally the models may generate output that does not parse to valid JSON objects.
To prevent these errors and improve model performance, when using `gpt-4o`, `gpt-4-turbo`, or `gpt-3.5-turbo`, you can set [response_format](/docs/api-reference/chat/create#chat-create-response_format) to `{ "type": "json_object" }` to enable JSON mode. When JSON mode is enabled, the model is constrained to only generate strings that parse into a valid JSON object.
Important notes:
- When using JSON mode, **always** instruct the model to produce JSON via some message in the conversation, for example via your system message. If you don't include an explicit instruction to generate JSON, the model may generate an unending stream of whitespace and the request may run continually until it reaches the token limit. To help ensure you don't forget, the API will throw an error if the string `"JSON"` does not appear somewhere in the context.
- The JSON in the message the model returns may be partial (i.e. cut off) if `finish_reason` is `length`, which indicates the generation exceeded `max_tokens` or the conversation exceeded the token limit. To guard against this, check `finish_reason` before parsing the response.
- JSON mode will not guarantee the output matches any specific schema, only that it is valid and parses without errors.
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
response = client.chat.completions.create(
model="gpt-3.5-turbo-0125",
response_format={ "type": "json_object" },
messages=[
{"role": "system", "content": "You are a helpful assistant designed to output JSON."},
{"role": "user", "content": "Who won the world series in 2020?"}
]
)
print(response.choices[0].message.content)
`.trim(),
"node.js": `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const completion = await openai.chat.completions.create({
messages: [
{
role: "system",
content: "You are a helpful assistant designed to output JSON.",
},
{ role: "user", content: "Who won the world series in 2020?" },
],
model: "gpt-3.5-turbo-0125",
response_format: { type: "json_object" },
});
console.log(completion.choices[0].message.content);
}\n
main();
`.trim(),
curl: `
curl https://api.openai.com/v1/chat/completions \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{
"model": "gpt-3.5-turbo-0125",
"response_format": { "type": "json_object" },
"messages": [
{
"role": "system",
"content": "You are a helpful assistant designed to output JSON."
},
{
"role": "user",
"content": "Who won the world series in 2020?"
}
]
}'
`.trim(),
}}
/>
In this example, the response includes a JSON object that looks something like the following:
```json
"content": "{\"winner\": \"Los Angeles Dodgers\"}"`
```
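Per the notes above, a minimal sketch of guarding the parse on the client side might look like this; it repeats the request shown above and checks `finish_reason` before calling `json.loads`:
```python
import json

from openai import OpenAI

client = OpenAI()

response = client.chat.completions.create(
    model="gpt-3.5-turbo-0125",
    response_format={"type": "json_object"},
    messages=[
        {"role": "system", "content": "You are a helpful assistant designed to output JSON."},
        {"role": "user", "content": "Who won the world series in 2020?"},
    ],
)

choice = response.choices[0]

# Only parse when the model finished cleanly; "length" may mean the JSON was cut off.
if choice.finish_reason == "stop":
    data = json.loads(choice.message.content)
    print(data)
else:
    print(f"Not parsing: finish_reason was {choice.finish_reason!r}")
```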
Note that JSON mode is always enabled when the model is generating arguments as part of [function calling](/docs/guides/function-calling).
### Reproducible outputs (Beta)
Chat Completions are non-deterministic by default (which means model outputs may differ from request to request). That being said, we offer some control towards deterministic outputs by giving you access to the [seed](/docs/api-reference/chat/create#chat-create-seed) parameter and the [system_fingerprint](/docs/api-reference/completions/object#completions/object-system_fingerprint) response field.
To receive (mostly) deterministic outputs across API calls, you can:
- Set the [seed](/docs/api-reference/chat/create#chat-create-seed) parameter to any integer of your choice and use the same value across requests you'd like deterministic outputs for.
- Ensure all other parameters (like `prompt` or `temperature`) are the exact same across requests.
Sometimes, determinism may be impacted due to necessary changes OpenAI makes to model configurations on our end. To help you keep track of these changes, we expose the [system_fingerprint](/docs/api-reference/chat/object#chat/object-system_fingerprint) field. If this value is different, you may see different outputs due to changes we've made on our systems.
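A minimal sketch of requesting (mostly) deterministic outputs and recording the system fingerprint; the seed value and prompt are arbitrary:
```python
from openai import OpenAI

client = OpenAI()

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    seed=12345,        # reuse the same seed across requests
    temperature=0,     # keep all other parameters identical as well
    messages=[{"role": "user", "content": "Name three prime numbers."}],
)

print(response.choices[0].message.content)
# If this value changes between runs, backend configuration changed and
# outputs may differ even with the same seed.
print(response.system_fingerprint)
```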
<a
  href="https://cookbook.openai.com/examples/reproducible_outputs_with_the_seed_parameter"
  target="_blank"
>
  <IconItem
    color="purple"
    title="Deterministic outputs"
    className="mt-6"
  >
    Explore the new seed parameter in the OpenAI cookbook
  </IconItem>
</a>
## Managing tokens
Language models read and write text in chunks called tokens. In English, a token can be as short as one character or as long as one word (e.g., `a` or ` apple`), and in some languages tokens can be even shorter than one character or even longer than one word.
For example, the string `"ChatGPT is great!"` is encoded into six tokens: `["Chat", "G", "PT", " is", " great", "!"]`.
The total number of tokens in an API call affects:
- How much your API call costs, as you pay per token
- How long your API call takes, as writing more tokens takes more time
- Whether your API call works at all, as total tokens must be below the model's maximum limit (4097 tokens for `gpt-3.5-turbo`)
Both input and output tokens count toward these quantities. For example, if your API call used 10 tokens in the message input and you received 20 tokens in the message output, you would be billed for 30 tokens. Note however that for some models the price per token is different for tokens in the input vs. the output (see the [pricing](https://openai.com/pricing) page for more information).
To see how many tokens are used by an API call, check the `usage` field in the API response (e.g., `response['usage']['total_tokens']`).
Chat models like `gpt-3.5-turbo` and `gpt-4-turbo-preview` use tokens in the same way as the models available in the completions API, but because of their message-based formatting, it's more difficult to count how many tokens will be used by a conversation.
Below is an example function for counting tokens for messages passed to `gpt-3.5-turbo-0613`.
The exact way that messages are converted into tokens may change from model to model. So when future model versions are released, the answers returned by this function may be only approximate.
```python
import tiktoken

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0613"):
"""Returns the number of tokens used by a list of messages."""
try:
encoding = tiktoken.encoding_for_model(model)
except KeyError:
encoding = tiktoken.get_encoding("cl100k_base")
if model == "gpt-3.5-turbo-0613": # note: future models may deviate from this
num_tokens = 0
for message in messages:
num_tokens += 4 # every message follows {role/name}\n{content}\n
for key, value in message.items():
num_tokens += len(encoding.encode(value))
if key == "name": # if there's a name, the role is omitted
num_tokens += -1 # role is always required and always 1 token
num_tokens += 2 # every reply is primed with assistant
return num_tokens
else:
raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.""")
```
Next, create a message and pass it to the function defined above to see the token count; this should match the value returned by the API usage parameter:
```python
messages = [
{"role": "system", "content": "You are a helpful, pattern-following assistant that translates corporate jargon into plain English."},
{"role": "system", "name":"example_user", "content": "New synergies will help drive top-line growth."},
{"role": "system", "name": "example_assistant", "content": "Things working well together will increase revenue."},
{"role": "system", "name":"example_user", "content": "Let's circle back when we have more bandwidth to touch base on opportunities for increased leverage."},
{"role": "system", "name": "example_assistant", "content": "Let's talk later when we're less busy about how to do better."},
{"role": "user", "content": "This late pivot means we don't have time to boil the ocean for the client deliverable."},
]
model = "gpt-3.5-turbo-0613"
print(f"{num_tokens_from_messages(messages, model)} prompt tokens counted.")
# Should show ~126 total_tokens
```
To confirm the number generated by our function above is the same as what the API returns, create a new Chat Completion:
```python
# example token count from the OpenAI API
from openai import OpenAI
client = OpenAI()
response = client.chat.completions.create(
model=model,
messages=messages,
temperature=0,
)
print(f'{response.usage.prompt_tokens} prompt tokens used.')
```
To see how many tokens are in a text string without making an API call, use OpenAIs [tiktoken](https://github.com/openai/tiktoken) Python library. Example code can be found in the OpenAI Cookbooks guide on [how to count tokens with tiktoken](https://cookbook.openai.com/examples/how_to_count_tokens_with_tiktoken).
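For example, a quick offline count with tiktoken (no API call involved):
```python
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
tokens = encoding.encode("ChatGPT is great!")

print(tokens)       # the token IDs
print(len(tokens))  # the number of tokens in the string
```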
Each message passed to the API consumes the number of tokens in the content, role, and other fields, plus a few extra for behind-the-scenes formatting. This may change slightly in the future.
If a conversation has too many tokens to fit within a model's maximum limit (e.g., more than 4097 tokens for `gpt-3.5-turbo` or more than 128k tokens for `gpt-4o`), you will have to truncate, omit, or otherwise shrink your text until it fits. Beware that if a message is removed from the messages input, the model will lose all knowledge of it.
Note that very long conversations are more likely to receive incomplete replies. For example, a `gpt-3.5-turbo` conversation that is 4090 tokens long will have its reply cut off after just 6 tokens.
## Parameter details
### Frequency and presence penalties
The frequency and presence penalties found in the [Chat Completions API](/docs/api-reference/chat/create) and [Legacy Completions API](/docs/api-reference/completions) can be used to reduce the likelihood of sampling repetitive sequences of tokens.
They work by directly modifying the logits (un-normalized log-probabilities) with an additive contribution.
```python
mu[j] -> mu[j] - c[j] * alpha_frequency - float(c[j] > 0) * alpha_presence
```
Where:
- `mu[j]` is the logits of the j-th token
- `c[j]` is how often that token was sampled prior to the current position
- `float(c[j] > 0)` is 1 if `c[j] > 0` and 0 otherwise
- `alpha_frequency` is the frequency penalty coefficient
- `alpha_presence` is the presence penalty coefficient
As we can see, the presence penalty is a one-off additive contribution that applies to all tokens that have been sampled at least once and the frequency penalty is a contribution that is proportional to how often a particular token has already been sampled.
Reasonable values for the penalty coefficients are around 0.1 to 1 if the aim is to just reduce repetitive samples somewhat. If the aim is to strongly suppress repetition, then one can increase the coefficients up to 2, but this can noticeably degrade the quality of samples. Negative values can be used to increase the likelihood of repetition.
### Token log probabilities
The [logprobs](/docs/api-reference/chat/create#chat-create-logprobs) parameter found in the [Chat Completions API](/docs/api-reference/chat/create) and [Legacy Completions API](/docs/api-reference/completions), when requested, provides the log probabilities of each output token, and a limited number of the most likely tokens at each token position alongside their log probabilities. This can be useful in some cases to assess the confidence of the model in its output, or to examine alternative responses the model might have given.
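A minimal sketch of requesting token log probabilities in Chat Completions; the prompt and the `top_logprobs` value are arbitrary:
```python
from openai import OpenAI

client = OpenAI()

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Answer with one word: what is the capital of France?"}],
    logprobs=True,
    top_logprobs=2,
)

# Inspect the log probability of each generated token and its top alternatives.
for token_info in response.choices[0].logprobs.content:
    alternatives = [(alt.token, alt.logprob) for alt in token_info.top_logprobs]
    print(token_info.token, token_info.logprob, alternatives)
```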
## Completions API (Legacy)
The completions API endpoint received its final update in July 2023 and has a different interface than the new chat completions endpoint. Instead of the input being a list of messages, the input is a freeform text string called a `prompt`.
An example legacy Completions API call looks like the following:
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()\n
response = client.completions.create(
model="gpt-3.5-turbo-instruct",
prompt="Write a tagline for an ice cream shop."
)
`.trim(),
"node.js": `
const completion = await openai.completions.create({
model: 'gpt-3.5-turbo-instruct',
prompt: 'Write a tagline for an ice cream shop.'
});
`.trim(),
}}
/>
See the full [API reference documentation](https://platform.openai.com/docs/api-reference/completions) to learn more.
#### Inserting text
The completions endpoint also supports inserting text by providing a [suffix](/docs/api-reference/completions/create#completions-create-suffix) in addition to the standard prompt which is treated as a prefix. This need naturally arises when writing long-form text, transitioning between paragraphs, following an outline, or guiding the model towards an ending. This also works on code, and can be used to insert in the middle of a function or file.
To illustrate how suffix context affects generated text, consider the prompt, “Today I decided to make a big change.” There are many ways one could imagine completing the sentence. But if we now supply the ending of the story: “I've gotten many compliments on my new hair!”, the intended completion becomes clear.
> I went to college at Boston University. After getting my degree, I decided to make a change**. A big change!**
> **I packed my bags and moved to the west coast of the United States.**
> Now, I can't get enough of the Pacific Ocean!
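A sketch of the corresponding Completions call, using the prompt and suffix from the story above (`max_tokens` here is an arbitrary choice):
```python
from openai import OpenAI

client = OpenAI()

response = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt="Today I decided to make a big change.",
    suffix="I've gotten many compliments on my new hair!",
    max_tokens=64,
)
print(response.choices[0].text)
```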
By providing the model with additional context, it can be much more steerable. However, this is a more constrained and challenging task for the model. To get the best results, we recommend the following:
**Use `max_tokens` > 256.** The model is better at inserting longer completions. If `max_tokens` is too small, the model may be cut off before it is able to connect to the suffix. Note that you will only be charged for the number of tokens produced, even when using a larger `max_tokens`.
**Prefer `finish_reason` == "stop".** When the model reaches a natural stopping point or a user-provided stop sequence, it will set `finish_reason` as "stop". This indicates that the model has managed to connect to the suffix well and is a good signal for the quality of a completion. This is especially relevant for choosing between a few completions when using n > 1 or resampling (see the next point).
**Resample 3-5 times.** While almost all completions connect to the prefix, the model may struggle to connect the suffix in harder cases. We find that resampling 3 or 5 times (or using best_of with k=3,5) and picking the samples with "stop" as their `finish_reason` can be an effective approach in such cases. While resampling, you typically want a higher temperature to increase diversity.
Note: if all the returned samples have `finish_reason` == "length", it's likely that `max_tokens` is too small and the model runs out of tokens before it manages to connect the prompt and the suffix naturally. Consider increasing `max_tokens` before resampling.
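As a rough sketch of this resampling pattern, you can request several samples in one call and keep only those that stopped naturally:
```python
from openai import OpenAI

client = OpenAI()

response = client.completions.create(
    model="gpt-3.5-turbo-instruct",
    prompt="Today I decided to make a big change.",
    suffix="I've gotten many compliments on my new hair!",
    max_tokens=300,
    temperature=1.0,  # a higher temperature increases diversity across samples
    n=5,              # draw several samples in a single request
)

# Prefer completions that connected to the suffix and stopped naturally.
stopped = [choice for choice in response.choices if choice.finish_reason == "stop"]
best = stopped[0] if stopped else response.choices[0]
print(best.text)
```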
**Try giving more clues.** In some cases, to better guide the model's generation, you can provide clues by giving a few examples of patterns that the model can follow to decide a natural place to stop.
> How to make a delicious hot chocolate:
>
> 1.** Boil water**
> **2. Put hot chocolate in a cup**
> **3. Add boiling water to the cup**
> 4. Enjoy the hot chocolate
> 1. Dogs are loyal animals.
> 2. Lions are ferocious animals.
> 3. Dolphins** are playful animals.**
> 4. Horses are majestic animals.
### Completions response format
An example completions API response looks as follows:
```json
{
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"text": "\n\n\"Let Your Sweet Tooth Run Wild at Our Creamy Ice Cream Shack"
}
],
"created": 1683130927,
"id": "cmpl-7C9Wxi9Du4j1lQjdjhxBlO22M61LD",
"model": "gpt-3.5-turbo-instruct",
"object": "text_completion",
"usage": {
"completion_tokens": 16,
"prompt_tokens": 10,
"total_tokens": 26
}
}
```
In Python, the output can be extracted with `response.choices[0].text`.
The response format is similar to the response format of the Chat Completions API.
## Chat Completions vs. Completions
The Chat Completions format can be made similar to the completions format by constructing a request using a single user message. For example, one can translate from English to French with the following completions prompt:
```
Translate the following English text to French: "{text}"
```
And an equivalent chat prompt would be:
```
[{"role": "user", "content": 'Translate the following English text to French: "{text}"'}]
```
Likewise, the completions API can be used to simulate a chat between a user and an assistant by formatting the input [accordingly](https://platform.openai.com/playground/p/default-chat?model=gpt-3.5-turbo-instruct).
The difference between these APIs is the underlying models that are available in each. The Chat Completions API is the interface to our most capable model (`gpt-4o`) and our most cost-effective model (`gpt-3.5-turbo`).
### Prompt engineering
An awareness of the best practices for working with OpenAI models can make a significant difference in application performance. The failure modes that each exhibit and the ways of working around or correcting those failure modes are not always intuitive. There is an entire field related to working with language models which has come to be known as "prompt engineering", but as the field has progressed, its scope has outgrown merely engineering the prompt and now encompasses engineering systems that use model queries as components. To learn more, read our guide on [prompt engineering](/docs/guides/prompt-engineering) which covers methods to improve model reasoning, reduce the likelihood of model hallucinations, and more. You can also find many useful resources including code samples in the [OpenAI Cookbook](https://cookbook.openai.com).
## FAQ
### Which model should I use?
We generally recommend that you default to using either `gpt-4o`, `gpt-4-turbo`, or `gpt-3.5-turbo`. If your use case requires high intelligence or reasoning about images as well as text, we recommend you evaluate both `gpt-4o` and `gpt-4-turbo` (although they have very similar intelligence, note that `gpt-4o` is both faster and cheaper). If your use case requires the fastest speed and lowest cost, we recommend `gpt-3.5-turbo` since it is optimized for these aspects.
`gpt-4o` and `gpt-4-turbo` are also less likely than `gpt-3.5-turbo` to make up information, a behavior known as "hallucination". Finally, `gpt-4o` and `gpt-4-turbo` have a context window that supports up to 128,000 tokens compared to 4,096 tokens for `gpt-3.5-turbo`, meaning they can reason over much more information at once.
We recommend experimenting in the [playground](https://platform.openai.com/playground?mode=chat) to investigate which models provide the best price performance trade-off for your usage. A common design pattern is to use several distinct query types which are each dispatched to the model appropriate to handle them.
### How should I set the temperature parameter?
Lower values for temperature result in more consistent outputs (e.g. 0.2), while higher values generate more diverse and creative results (e.g. 1.0). Select a temperature value based on the desired trade-off between coherence and creativity for your specific application. The temperature ranges from 0 to 2.
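For example (the prompts and values below are purely illustrative):
```python
from openai import OpenAI

client = OpenAI()

# Lower temperature for more consistent, factual output
consistent = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "List three capital cities in Europe."}],
    temperature=0.2,
)

# Higher temperature for more varied, creative output
creative = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Write an opening line for a mystery novel."}],
    temperature=1.0,
)
```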
### Is fine-tuning available for the latest models?
See the [fine-tuning guide](/docs/guides/fine-tuning) for the latest information on which models are available for fine-tuning and how to get started.
### Do you store the data that is passed into the API?
As of March 1st, 2023, we retain your API data for 30 days but no longer use your data sent via the API to improve our models. Learn more in our [data usage policy](https://openai.com/policies/usage-policies). Some endpoints offer [zero retention](/docs/models/default-usage-policies-by-endpoint).
### How can I make my application more safe?
If you want to add a moderation layer to the outputs of the Chat API, you can follow our [moderation guide](/docs/guides/moderation) to prevent content that violates OpenAI's usage policies from being shown. We also encourage you to read our [safety guide](/docs/guides/safety-best-practices) for more information on how to build safer systems.
### Should I use ChatGPT or the API?
[ChatGPT](https://chatgpt.com) offers a chat interface for our models and a range of built-in features such as integrated browsing, code execution, plugins, and more. By contrast, using OpenAI's API provides more flexibility but requires that you write code or send the requests to our models programmatically.

@ -0,0 +1,157 @@
# Text to speech
Learn how to turn text into lifelike spoken audio
## Introduction
The Audio API provides a [`speech`](/docs/api-reference/audio/createSpeech) endpoint based on our [TTS (text-to-speech) model](/docs/models/tts). It comes with 6 built-in voices and can be used to:
- Narrate a written blog post
- Produce spoken audio in multiple languages
- Give real time audio output using streaming
Here is an example of the `alloy` voice:
Please note that our usage policies require you to provide a clear disclosure to end users that the TTS voice they are hearing is AI-generated and not a human voice.
## Quick start
The `speech` endpoint takes in three key inputs: the [model](/docs/api-reference/audio/createSpeech#audio-createspeech-model), the [text](/docs/api-reference/audio/createSpeech#audio-createspeech-input) that should be turned into audio, and the [voice](/docs/api-reference/audio/createSpeech#audio-createspeech-voice) to be used for the audio generation. A simple request would look like the following:
<CodeSample
title="Generate spoken audio from input text"
defaultLanguage="python"
code={{
python: `
from pathlib import Path
from openai import OpenAI
client = OpenAI()\n
speech_file_path = Path(__file__).parent / "speech.mp3"
response = client.audio.speech.create(
model="tts-1",
voice="alloy",
input="Today is a wonderful day to build something people love!"
)\n
response.stream_to_file(speech_file_path)
`.trim(),
curl: `
curl https://api.openai.com/v1/audio/speech \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "Content-Type: application/json" \\
-d '{
"model": "tts-1",
"input": "Today is a wonderful day to build something people love!",
"voice": "alloy"
}' \\
--output speech.mp3
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
const speechFile = path.resolve("./speech.mp3");\n
async function main() {
const mp3 = await openai.audio.speech.create({
model: "tts-1",
voice: "alloy",
input: "Today is a wonderful day to build something people love!",
});
console.log(speechFile);
const buffer = Buffer.from(await mp3.arrayBuffer());
await fs.promises.writeFile(speechFile, buffer);
}
main();
`.trim(),
}}
/>
By default, the endpoint will output an MP3 file of the spoken audio, but it can also be configured to output any of our [supported formats](/docs/guides/text-to-speech/supported-output-formats).
## Audio quality
For real-time applications, the standard `tts-1` model provides the lowest latency but at a lower quality than the `tts-1-hd` model. Due to the way the audio is generated, `tts-1` is likely to produce more static in certain situations than `tts-1-hd`. In some cases, the difference may not be noticeable depending on your listening device and the individual listener.
## Voice options
Experiment with different voices (`alloy`, `echo`, `fable`, `onyx`, `nova`, and `shimmer`) to find one that matches your desired tone and audience. The current voices are optimized for English.
### Alloy
### Echo
### Fable
### Onyx
### Nova
### Shimmer
## Supported output formats
The default response format is "mp3", but other formats like "opus", "aac", "flac", "wav", and "pcm" are available.
- **Opus**: For internet streaming and communication, low latency.
- **AAC**: For digital audio compression, preferred by YouTube, Android, iOS.
- **FLAC**: For lossless audio compression, favored by audio enthusiasts for archiving.
- **WAV**: Uncompressed WAV audio, suitable for low-latency applications to avoid decoding overhead.
- **PCM**: Similar to WAV but contains the raw samples at 24 kHz (16-bit signed, little-endian), without the header.
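For example, the output format can be selected per request with `response_format` (a sketch; the filename is arbitrary):
```python
from openai import OpenAI

client = OpenAI()

response = client.audio.speech.create(
    model="tts-1",
    voice="alloy",
    input="Today is a wonderful day to build something people love!",
    response_format="wav",  # one of "mp3", "opus", "aac", "flac", "wav", or "pcm"
)
response.stream_to_file("speech.wav")
```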
## Supported languages
The TTS model generally follows the Whisper model in terms of language support. Whisper [supports the following languages](https://github.com/openai/whisper#available-models-and-languages) and performs well despite the current voices being optimized for English:
Afrikaans, Arabic, Armenian, Azerbaijani, Belarusian, Bosnian, Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, Galician, German, Greek, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Italian, Japanese, Kannada, Kazakh, Korean, Latvian, Lithuanian, Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh.
You can generate spoken audio in these languages by providing the input text in the language of your choice.
## Streaming real time audio
The Speech API provides support for real-time audio streaming using [chunked transfer encoding](https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Transfer-Encoding). This means that the audio can be played before the full file has been generated and made accessible.
```python
from openai import OpenAI
client = OpenAI()
response = client.audio.speech.create(
model="tts-1",
voice="alloy",
input="Hello world! This is a streaming test.",
)
response.stream_to_file("output.mp3")
```
## FAQ
### How can I control the emotional range of the generated audio?
There is no direct mechanism to control the emotional output of the generated audio. Certain factors, like capitalization or grammar, may influence the output audio, but our internal tests with these have yielded mixed results.
### Can I create a custom copy of my own voice?
No, this is not something we support.
### Do I own the outputted audio files?
Yes, like with all outputs from our API, the person who created them owns the output. You are still required to inform end users that they are hearing audio generated by AI and not a real person talking to them.

@ -0,0 +1,20 @@
#### Tier 5 rate limits
This is a high level summary and there are per-model exceptions to these limits (e.g. some legacy models or models with larger context windows have different rate limits). To view the exact rate limits per model for your account, visit the [limits](/account/rate-limits) section of your account settings.
| Model | RPM | TPM | Batch Queue Limit |
| ------------------------ | ----------- | ---------- | ----------------- |
| `gpt-4o` | 10,000 | 12,000,000 | 1,800,000,000 |
| `gpt-4-turbo` | 10,000 | 2,000,000 | 300,000,000 |
| `gpt-4` | 10,000 | 300,000 | 45,000,000 |
| `gpt-3.5-turbo` | 10,000 | 2,000,000 | 300,000,000 |
| `text-embedding-3-large` | 10,000 | 10,000,000 | 4,000,000,000 |
| `text-embedding-3-small` | 10,000 | 10,000,000 | 4,000,000,000 |
| `text-embedding-ada-002` | 10,000 | 10,000,000 | 4,000,000,000 |
| `whisper-1` | 500 | - | - |
| `tts-1` | 500 | - | - |
| `tts-1-hd` | 20 | - | - |
| `dall-e-2` | 500 img/min | - | - |
| `dall-e-3` | 75 img/min | - | - |
If you need higher rate limits than this, you can request an increase in the [limits](/account/rate-limits) section of your account settings.

@ -0,0 +1,18 @@
#### Tier 4 rate limits
This is a high level summary and there are per-model exceptions to these limits (e.g. some legacy models or models with larger context windows have different rate limits). To view the exact rate limits per model for your account, visit the [limits](/account/rate-limits) section of your account settings.
| Model | RPM | TPM | Batch Queue Limit |
| ------------------------ | ----------- | --------- | ----------------- |
| `gpt-4o` | 10,000 | 800,000 | 80,000,000 |
| `gpt-4-turbo` | 10,000 | 800,000 | 80,000,000 |
| `gpt-4` | 10,000 | 300,000 | 30,000,000 |
| `gpt-3.5-turbo` | 10,000 | 1,000,000 | 100,000,000 |
| `text-embedding-3-large` | 10,000 | 5,000,000 | 500,000,000 |
| `text-embedding-3-small` | 10,000 | 5,000,000 | 500,000,000 |
| `text-embedding-ada-002` | 10,000 | 5,000,000 | 500,000,000 |
| `whisper-1` | 100 | - | - |
| `tts-1` | 100 | - | - |
| `tts-1-hd` | 10 | - | - |
| `dall-e-2` | 100 img/min | - | - |
| `dall-e-3` | 15 img/min | - | - |

@ -0,0 +1,14 @@
#### Free tier rate limits
This is a high level summary and there are per-model exceptions to these limits (e.g. some legacy models or models with larger context windows have different rate limits). To view the exact rate limits per model for your account, visit the [limits](/account/rate-limits) section of your account settings.
| Model | RPM | RPD | TPM | Batch Queue Limit |
| ------------------------ | --------- | --- | --------- | ----------------- |
| `gpt-3.5-turbo` | 3 | 200 | 40,000 | 200,000 |
| `text-embedding-3-large` | 3,000 | 200 | 1,000,000 | 3,000,000 |
| `text-embedding-3-small` | 3,000 | 200 | 1,000,000 | 3,000,000 |
| `text-embedding-ada-002` | 3,000 | 200 | 1,000,000 | 3,000,000 |
| `whisper-1` | 3 | 200 | - | - |
| `tts-1` | 3 | 200 | - | - |
| `dall-e-2` | 5 img/min | - | - | - |
| `dall-e-3` | 1 img/min | - | - | - |

@ -0,0 +1,18 @@
#### Tier 1 rate limits
This is a high level summary and there are per-model exceptions to these limits (e.g. some legacy models or models with larger context windows have different rate limits). To view the exact rate limits per model for your account, visit the [limits](/account/rate-limits) section of your account settings.
| Model | RPM | RPD | TPM | Batch Queue Limit |
| ------------------------ | --------- | ------ | --------- | ----------------- |
| `gpt-4o` | 500 | - | 30,000 | 90,000 |
| `gpt-4-turbo` | 500 | - | 30,000 | 90,000 |
| `gpt-4` | 500 | 10,000 | 10,000 | 100,000 |
| `gpt-3.5-turbo` | 3,500 | 10,000 | 60,000 | 200,000 |
| `text-embedding-3-large` | 3,000 | - | 1,000,000 | 3,000,000 |
| `text-embedding-3-small` | 3,000 | - | 1,000,000 | 3,000,000 |
| `text-embedding-ada-002` | 3,000 | - | 1,000,000 | 3,000,000 |
| `whisper-1` | 50 | - | - | - |
| `tts-1` | 50 | - | - | - |
| `tts-1-hd` | 3 | - | - | - |
| `dall-e-2` | 5 img/min | - | - | - |
| `dall-e-3` | 5 img/min | - | - | - |

@ -0,0 +1,18 @@
#### Tier 3 rate limits
This is a high level summary and there are per-model exceptions to these limits (e.g. some legacy models or models with larger context windows have different rate limits). To view the exact rate limits per model for your account, visit the [limits](/account/rate-limits) section of your account settings.
| Model | RPM | TPM | Batch Queue Limit |
| ------------------------ | ----------- | --------- | ----------------- |
| `gpt-4o` | 5,000 | 600,000 | 40,000,000 |
| `gpt-4-turbo` | 5,000 | 600,000 | 40,000,000 |
| `gpt-4` | 5,000 | 80,000 | 5,000,000 |
| `gpt-3.5-turbo` | 3,500 | 160,000 | 10,000,000 |
| `text-embedding-3-large` | 5,000 | 5,000,000 | 100,000,000 |
| `text-embedding-3-small` | 5,000 | 5,000,000 | 100,000,000 |
| `text-embedding-ada-002` | 5,000 | 5,000,000 | 100,000,000 |
| `whisper-1` | 100 | - | - |
| `tts-1` | 100 | - | - |
| `tts-1-hd` | 7 | - | - |
| `dall-e-2` | 100 img/min | - | - |
| `dall-e-3` | 7 img/min | - | - |

@ -0,0 +1,18 @@
#### Tier 2 rate limits
This is a high level summary and there are per-model exceptions to these limits (e.g. some legacy models or models with larger context windows have different rate limits). To view the exact rate limits per model for your account, visit the [limits](/account/rate-limits) section of your account settings.
| Model | RPM | TPM | Batch Queue Limit |
| ------------------------ | ---------- | --------- | ----------------- |
| `gpt-4o` | 5,000 | 450,000 | 1,350,000 |
| `gpt-4-turbo` | 5,000 | 450,000 | 1,350,000 |
| `gpt-4` | 5,000 | 40,000 | 200,000 |
| `gpt-3.5-turbo` | 3,500 | 80,000 | 400,000 |
| `text-embedding-3-large` | 5,000 | 1,000,000 | 20,000,000 |
| `text-embedding-3-small` | 5,000 | 1,000,000 | 20,000,000 |
| `text-embedding-ada-002` | 5,000 | 1,000,000 | 20,000,000 |
| `whisper-1` | 50 | - | - |
| `tts-1` | 50 | - | - |
| `tts-1-hd` | 5 | - | - |
| `dall-e-2` | 50 img/min | - | - |
| `dall-e-3` | 7 img/min | - | - |

@ -0,0 +1,358 @@
# Code Interpreter (Beta)
Code Interpreter allows Assistants to write and run Python code in a sandboxed execution environment. This tool can process files with diverse data and formatting, and generate files with data and images of graphs. Code Interpreter allows your Assistant to run code iteratively to solve challenging code and math problems. When your Assistant writes code that fails to run, it can iterate on this code by attempting to run different code until the code execution succeeds.
See a quickstart of how to get started with Code Interpreter [here](/docs/assistants/overview/step-1-create-an-assistant?context=with-streaming).
## How it works
Code Interpreter is charged at $0.03 per session. If your Assistant calls Code Interpreter simultaneously in two different threads (e.g., one thread per end-user), two Code Interpreter sessions are created. Each session is active by default for one hour, which means that you only pay for one session if your user interacts with Code Interpreter in the same thread for up to one hour.
### Enabling Code Interpreter
Pass `code_interpreter` in the `tools` parameter of the Assistant object to enable Code Interpreter:
<CodeSample
defaultLanguage="python"
code={{
python: `
assistant = client.beta.assistants.create(
instructions="You are a personal math tutor. When asked a math question, write and run code to answer the question.",
model="gpt-4o",
tools=[{"type": "code_interpreter"}]
)
`.trim(),
"node.js": `
const assistant = await openai.beta.assistants.create({
instructions: "You are a personal math tutor. When asked a math question, write and run code to answer the question.",
model: "gpt-4o",
tools: [{"type": "code_interpreter"}]
});
`.trim(),
curl: `
curl https://api.openai.com/v1/assistants \\
-u :$OPENAI_API_KEY \\
-H 'Content-Type: application/json' \\
-H 'OpenAI-Beta: assistants=v2' \\
-d '{
"instructions": "You are a personal math tutor. When asked a math question, write and run code to answer the question.",
"tools": [
{ "type": "code_interpreter" }
],
"model": "gpt-4o"
}'
`.trim(),
}}
/>
The model then decides when to invoke Code Interpreter in a Run based on the nature of the user request. This behavior can be promoted by prompting in the Assistant's `instructions` (e.g., “write code to solve this problem”).
### Passing files to Code Interpreter
Files that are passed at the Assistant level are accessible by all Runs with this Assistant:
<CodeSample
defaultLanguage="python"
code={{
python: `
# Upload a file with an "assistants" purpose
file = client.files.create(
file=open("mydata.csv", "rb"),
purpose='assistants'
)\n
# Create an assistant using the file ID
assistant = client.beta.assistants.create(
instructions="You are a personal math tutor. When asked a math question, write and run code to answer the question.",
model="gpt-4o",
tools=[{"type": "code_interpreter"}],
tool_resources={
"code_interpreter": {
"file_ids": [file.id]
}
}
)
`.trim(),
"node.js": `
// Upload a file with an "assistants" purpose
const file = await openai.files.create({
file: fs.createReadStream("mydata.csv"),
purpose: "assistants",
});\n
// Create an assistant using the file ID
const assistant = await openai.beta.assistants.create({
instructions: "You are a personal math tutor. When asked a math question, write and run code to answer the question.",
model: "gpt-4o",
tools: [{"type": "code_interpreter"}],
tool_resources: {
"code_interpreter": {
"file_ids": [file.id]
}
}
});
`.trim(),
curl: `
# Upload a file with an "assistants" purpose
curl https://api.openai.com/v1/files \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-F purpose="assistants" \\
-F file="@/path/to/mydata.csv"\n
# Create an assistant using the file ID
curl https://api.openai.com/v1/assistants \\
-u :$OPENAI_API_KEY \\
-H 'Content-Type: application/json' \\
-H 'OpenAI-Beta: assistants=v2' \\
-d '{
"instructions": "You are a personal math tutor. When asked a math question, write and run code to answer the question.",
"tools": [{"type": "code_interpreter"}],
"model": "gpt-4o",
"tool_resources": {
"code_interpreter": {
"file_ids": ["file-BK7bzQj3FfZFXr7DbL6xJwfo"]
}
}
}'
`.trim(),
}}
/>
Files can also be passed at the Thread level. These files are only accessible in the specific Thread. Upload the File using the [File upload](/docs/api-reference/files/create) endpoint and then pass the File ID as part of the Message creation request:
<CodeSample
defaultLanguage="python"
code={{
python: `
thread = client.beta.threads.create(
messages=[
{
"role": "user",
"content": "I need to solve the equation \`3x + 11 = 14\`. Can you help me?",
"attachments": [
{
"file_id": file.id,
"tools": [{"type": "code_interpreter"}]
}
]
}
]
)
`.trim(),
"node.js": `
const thread = await openai.beta.threads.create({
messages: [
{
"role": "user",
"content": "I need to solve the equation \`3x + 11 = 14\`. Can you help me?",
"attachments": [
{
file_id: file.id,
tools: [{type: "code_interpreter"}]
}
]
}
]
});
`.trim(),
curl: `
curl https://api.openai.com/v1/threads/thread_abc123/messages \\
-u :$OPENAI_API_KEY \\
-H 'Content-Type: application/json' \\
-H 'OpenAI-Beta: assistants=v2' \\
-d '{
"role": "user",
"content": "I need to solve the equation \`3x + 11 = 14\`. Can you help me?",
"attachments": [
{
"file_id": "file-ACq8OjcLQm2eIG0BvRM4z5qX",
"tools": [{"type": "code_interpreter"}]
}
]
}'
`.trim(),
}}
/>
Files have a maximum size of 512 MB. Code Interpreter supports a variety of file formats including `.csv`, `.pdf`, `.json` and many more. More details on the file extensions (and their corresponding MIME-types) supported can be found in the [Supported files](/docs/assistants/tools/code-interpreter/supported-files) section below.
### Reading images and files generated by Code Interpreter
Code Interpreter in the API also outputs files, such as image diagrams, CSVs, and PDFs. There are two types of files that are generated:
1. Images
2. Data files (e.g. a `csv` file with data generated by the Assistant)
When Code Interpreter generates an image, you can look up and download this file in the `file_id` field of the Assistant Message response:
```json
{
"id": "msg_abc123",
"object": "thread.message",
"created_at": 1698964262,
"thread_id": "thread_abc123",
"role": "assistant",
"content": [
{
"type": "image_file",
"image_file": {
"file_id": "file-abc123"
}
}
]
# ...
}
```
The file content can then be downloaded by passing the file ID to the Files API:
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI\n
client = OpenAI()\n
image_data = client.files.content("file-abc123")
image_data_bytes = image_data.read()\n
with open("./my-image.png", "wb") as file:
file.write(image_data_bytes)
`.trim(),
"node.js": `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const response = await openai.files.content("file-abc123");\n
// Extract the binary data from the Response object
const image_data = await response.arrayBuffer();\n
// Convert the binary data to a Buffer
const image_data_buffer = Buffer.from(image_data);\n
// Save the image to a specific location
fs.writeFileSync("./my-image.png", image_data_buffer);
}\n
main();
`.trim(),
curl: `
curl https://api.openai.com/v1/files/file-abc123/content \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
--output image.png
`.trim(),
}}
/>
When Code Interpreter references a file path (e.g., "Download this csv file"), file paths are listed as annotations. You can convert these annotations into links to download the file:
```json
{
"id": "msg_abc123",
"object": "thread.message",
"created_at": 1699073585,
"thread_id": "thread_abc123",
"role": "assistant",
"content": [
{
"type": "text",
"text": {
"value": "The rows of the CSV file have been shuffled and saved to a new CSV file. You can download the shuffled CSV file from the following link:\n\n[Download Shuffled CSV File](sandbox:/mnt/data/shuffled_file.csv)",
"annotations": [
{
"type": "file_path",
"text": "sandbox:/mnt/data/shuffled_file.csv",
"start_index": 167,
"end_index": 202,
"file_path": {
"file_id": "file-abc123"
}
}
...
```
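A minimal sketch of turning such `file_path` annotations into local downloads (the thread and message IDs below are the placeholders from the example response above):
```python
from openai import OpenAI

client = OpenAI()

message = client.beta.threads.messages.retrieve(
    thread_id="thread_abc123",
    message_id="msg_abc123",
)

text_content = message.content[0].text
for annotation in text_content.annotations:
    if annotation.type == "file_path":
        # Download the referenced file via the Files API
        file_data = client.files.content(annotation.file_path.file_id)
        with open("shuffled_file.csv", "wb") as f:
            f.write(file_data.read())
        # Replace the sandbox path in the message text with your own link or filename
        text_content.value = text_content.value.replace(annotation.text, "shuffled_file.csv")
```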
### Input and output logs of Code Interpreter
By listing the steps of a Run that called Code Interpreter, you can inspect the code `input` and `outputs` logs of Code Interpreter:
<CodeSample
defaultLanguage="python"
code={{
python: `
run_steps = client.beta.threads.runs.steps.list(
thread_id=thread.id,
run_id=run.id
)
`.trim(),
"node.js": `
const runSteps = await openai.beta.threads.runs.steps.list(
thread.id,
run.id
);
`.trim(),
curl: `
curl https://api.openai.com/v1/threads/thread_abc123/runs/RUN_ID/steps \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "OpenAI-Beta: assistants=v2" \\
`.trim(),
}}
/>
```json
{
"object": "list",
"data": [
{
"id": "step_abc123",
"object": "thread.run.step",
"type": "tool_calls",
"run_id": "run_abc123",
"thread_id": "thread_abc123",
"status": "completed",
"step_details": {
"type": "tool_calls",
"tool_calls": [
{
"type": "code",
"code": {
"input": "# Calculating 2 + 2\nresult = 2 + 2\nresult",
"outputs": [
{
"type": "logs",
"logs": "4"
}
...
}
```
## Supported files
_For `text/` MIME types, the encoding must be one of `utf-8`, `utf-16`, or `ascii`._
| File format | MIME type |
| ----------- | --------------------------------------------------------------------------- |
| `.c` | `text/x-c` |
| `.cs` | `text/x-csharp` |
| `.cpp` | `text/x-c++` |
| `.doc` | `application/msword` |
| `.docx` | `application/vnd.openxmlformats-officedocument.wordprocessingml.document` |
| `.html` | `text/html` |
| `.java` | `text/x-java` |
| `.json` | `application/json` |
| `.md` | `text/markdown` |
| `.pdf` | `application/pdf` |
| `.php` | `text/x-php` |
| `.pptx` | `application/vnd.openxmlformats-officedocument.presentationml.presentation` |
| `.py` | `text/x-python` |
| `.py` | `text/x-script.python` |
| `.rb` | `text/x-ruby` |
| `.tex` | `text/x-tex` |
| `.txt` | `text/plain` |
| `.css` | `text/css` |
| `.js` | `text/javascript` |
| `.sh` | `application/x-sh` |
| `.ts` | `application/typescript` |
| `.csv` | `application/csv` |
| `.jpeg` | `image/jpeg` |
| `.jpg` | `image/jpeg` |
| `.gif` | `image/gif` |
| `.png` | `image/png` |
| `.tar` | `application/x-tar` |
| `.xlsx` | `application/vnd.openxmlformats-officedocument.spreadsheetml.sheet` |
| `.xml` | `application/xml` or `text/xml` |
| `.zip` | `application/zip` |

@ -0,0 +1,616 @@
# File Search (Beta)
File Search augments the Assistant with knowledge from outside its model, such as proprietary product information or documents provided by your users. OpenAI automatically parses and chunks your documents, creates and stores the embeddings, and uses both vector and keyword search to retrieve relevant content to answer user queries.
## Quickstart
In this example, we'll create an assistant that can help answer questions about companies' financial statements.
### Step 1: Create a new Assistant with File Search Enabled
Create a new assistant with `file_search` enabled in the `tools` parameter of the Assistant.
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()
assistant = client.beta.assistants.create(
name="Financial Analyst Assistant",
instructions="You are an expert financial analyst. Use your knowledge base to answer questions about audited financial statements.",
model="gpt-4o",
tools=[{"type": "file_search"}],
)`.trim(),
"node.js": `
const openai = new OpenAI();
async function main() {
const assistant = await openai.beta.assistants.create({
name: "Financial Analyst Assistant",
instructions: "You are an expert financial analyst. Use your knowledge base to answer questions about audited financial statements.",
model: "gpt-4o",
tools: [{ type: "file_search" }],
});
}
main();`.trim(),
curl: `
curl https://api.openai.com/v1/assistants \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-H "OpenAI-Beta: assistants=v2" \\
-d '{
"name": "Financial Analyst Assistant",
"instructions": "You are an expert financial analyst. Use your knowledge base to answer questions about audited financial statements.",
"tools": [{"type": "file_search"}],
"model": "gpt-4o"
}'
`.trim(),
}}
/>
Once the `file_search` tool is enabled, the model decides when to retrieve content based on user messages.
### Step 2: Upload files and add them to a Vector Store
To access your files, the `file_search` tool uses the Vector Store object.
Upload your files and create a Vector Store to contain them.
Once the Vector Store is created, you should poll its status until all files are out of the `in_progress` state to
ensure that all content has finished processing. The SDK provides helpers to upload and poll in one shot.
<CodeSample
defaultLanguage="python"
code={{
python: `
# Create a vector store called "Financial Statements"
vector_store = client.beta.vector_stores.create(name="Financial Statements")
# Ready the files for upload to OpenAI
file_paths = ["edgar/goog-10k.pdf", "edgar/brka-10k.txt"]
file_streams = [open(path, "rb") for path in file_paths]
# Use the upload and poll SDK helper to upload the files, add them to the vector store,
# and poll the status of the file batch for completion.
file_batch = client.beta.vector_stores.file_batches.upload_and_poll(
vector_store_id=vector_store.id, files=file_streams
)
# You can print the status and the file counts of the batch to see the result of this operation.
print(file_batch.status)
print(file_batch.file_counts)
`.trim(),
"node.js": `
const fileStreams = ["edgar/goog-10k.pdf", "edgar/brka-10k.txt"].map((path) =>
fs.createReadStream(path),
);
// Create a vector store including our two files.
let vectorStore = await openai.beta.vectorStores.create({
name: "Financial Statement",
});
await openai.beta.vectorStores.fileBatches.uploadAndPoll(vectorStore.id, fileStreams)
`.trim(),
}}
/>
### Step 3: Update the assistant to use the new Vector Store
To make the files accessible to your assistant, update the assistant's `tool_resources` with the new `vector_store` id.
<CodeSample
defaultLanguage="python"
code={{
python: `
assistant = client.beta.assistants.update(
assistant_id=assistant.id,
tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)
`.trim(),
"node.js": `
await openai.beta.assistants.update(assistant.id, {
tool_resources: { file_search: { vector_store_ids: [vectorStore.id] } },
});
`.trim(),
}}
/>
### Step 4: Create a thread
You can also attach files as Message attachments on your thread. Doing so will create another `vector_store` associated with the thread, or, if there is already a vector store attached to this thread, attach the new files to the existing thread vector store. When you create a Run on this thread, the file search tool will query both the `vector_store` from your assistant and the `vector_store` on the thread.
In this example, the user attached a copy of Apple's latest 10-K filing.
<CodeSample
defaultLanguage="python"
code={{
python: `
# Upload the user provided file to OpenAI
message_file = client.files.create(
file=open("edgar/aapl-10k.pdf", "rb"), purpose="assistants"
)
# Create a thread and attach the file to the message
thread = client.beta.threads.create(
messages=[
{
"role": "user",
"content": "How many shares of AAPL were outstanding at the end of October 2023?",
# Attach the new file to the message.
"attachments": [
{ "file_id": message_file.id, "tools": [{"type": "file_search"}] }
],
}
]
)
# The thread now has a vector store with that file in its tool resources.
print(thread.tool_resources.file_search)
`.trim(),
"node.js": `
// A user wants to attach a file to a specific message, let's upload it.
const aapl10k = await openai.files.create({
file: fs.createReadStream("edgar/aapl-10k.pdf"),
purpose: "assistants",
});\n
const thread = await openai.beta.threads.create({
messages: [
{
role: "user",
content:
"How many shares of AAPL were outstanding at the end of October 2023?",
// Attach the new file to the message.
attachments: [{ file_id: aapl10k.id, tools: [{ type: "file_search" }] }],
},
],
});\n
// The thread now has a vector store in its tool resources.
console.log(thread.tool_resources?.file_search);
`.trim(),
}}
/>
Vector stores created using message attachments have a default expiration policy of 7 days after they were last active (defined as the last time the vector store was part of a run). This default exists to help you manage your vector storage costs. You can override these expiration policies at any time. Learn more [here](/docs/assistants/tools/file-search/managing-costs-with-expiration-policies).
### Step 5: Create a run and check the output
Now, create a Run and observe that the model uses the File Search tool to provide a response to the user's question.
<ContentSwitcher
options={[
{
value: "streaming",
label: "With streaming",
content: (
<CodeSample
defaultLanguage="python"
code={{
python: `
from typing_extensions import override
from openai import AssistantEventHandler, OpenAI
client = OpenAI()
class EventHandler(AssistantEventHandler):
@override
def on_text_created(self, text) -> None:
print(f"\\nassistant > ", end="", flush=True)\n
@override
def on_tool_call_created(self, tool_call):
print(f"\\nassistant > {tool_call.type}\\n", flush=True)\n
@override
def on_message_done(self, message) -> None:
# print a citation to the file searched
message_content = message.content[0].text
annotations = message_content.annotations
citations = []
for index, annotation in enumerate(annotations):
message_content.value = message_content.value.replace(
annotation.text, f"[{index}]"
)
if file_citation := getattr(annotation, "file_citation", None):
cited_file = client.files.retrieve(file_citation.file_id)
citations.append(f"[{index}] {cited_file.filename}")\n
print(message_content.value)
print("\\n".join(citations))\n\n
# Then, we use the stream SDK helper
# with the EventHandler class to create the Run
# and stream the response.\n
with client.beta.threads.runs.stream(
thread_id=thread.id,
assistant_id=assistant.id,
instructions="Please address the user as Jane Doe. The user has a premium account.",
event_handler=EventHandler(),
) as stream:
stream.until_done()
`.trim(),
"node.js": `
const stream = openai.beta.threads.runs
.stream(thread.id, {
assistant_id: assistant.id,
})
.on("textCreated", () => console.log("assistant >"))
.on("toolCallCreated", (event) => console.log("assistant " + event.type))
.on("messageDone", async (event) => {
if (event.content[0].type === "text") {
const { text } = event.content[0];
const { annotations } = text;
const citations: string[] = [];\n
let index = 0;
for (let annotation of annotations) {
text.value = text.value.replace(annotation.text, "[" + index + "]");
const { file_citation } = annotation;
if (file_citation) {
const citedFile = await openai.files.retrieve(file_citation.file_id);
citations.push("[" + index + "]" + citedFile.filename);
}
index++;
}\n
console.log(text.value);
console.log(citations.join("\\n"));
}
`.trim(),
}}
/>
),
},
{
value: "without-streaming",
label: "Without streaming",
content: (
<CodeSample
defaultLanguage="python"
code={{
python: `
# Use the create and poll SDK helper to create a run and poll the status of
# the run until it's in a terminal state.\n
run = client.beta.threads.runs.create_and_poll(
thread_id=thread.id, assistant_id=assistant.id
)\n
messages = list(client.beta.threads.messages.list(thread_id=thread.id, run_id=run.id))\n
message_content = messages[0].content[0].text
annotations = message_content.annotations
citations = []
for index, annotation in enumerate(annotations):
message_content.value = message_content.value.replace(annotation.text, f"[{index}]")
if file_citation := getattr(annotation, "file_citation", None):
cited_file = client.files.retrieve(file_citation.file_id)
citations.append(f"[{index}] {cited_file.filename}")\n
print(message_content.value)
print("\\n".join(citations))
`.trim(),
"node.js": `
const run = await openai.beta.threads.runs.createAndPoll(thread.id, {
assistant_id: assistant.id,
});
const messages = await openai.beta.threads.messages.list(thread.id, {
run_id: run.id,
});
const message = messages.data.pop()!;
if (message.content[0].type === "text") {
const { text } = message.content[0];
const { annotations } = text;
const citations: string[] = [];\n
let index = 0;
for (let annotation of annotations) {
text.value = text.value.replace(annotation.text, "[" + index + "]");
const { file_citation } = annotation;
if (file_citation) {
const citedFile = await openai.files.retrieve(file_citation.file_id);
citations.push("[" + index + "]" + citedFile.filename);
}
index++;
}\n
console.log(text.value);
console.log(citations.join("\\n"));
}
`.trim(),
}}
/>
),
},
]}
initialValue="streaming"
/>
Your new assistant will query both attached vector stores (one containing `goog-10k.pdf` and `brka-10k.txt`, and the other containing `aapl-10k.pdf`) and return this result from `aapl-10k.pdf`.
---
## How it works
The `file_search` tool implements several retrieval best practices out of the box to help you extract the right data from your files and augment the model's responses. The `file_search` tool:
- Rewrites user queries to optimize them for search.
- Breaks down complex user queries into multiple searches it can run in parallel.
- Runs both keyword and semantic searches across both assistant and thread vector stores.
- Reranks search results to pick the most relevant ones before generating the final response.
By default, the `file_search` tool uses the following settings but these can be [configured](/docs/assistants/tools/file-search/customizing-file-search-settings) to suit your needs:
- Chunk size: 800 tokens
- Chunk overlap: 400 tokens
- Embedding model: `text-embedding-3-large` at 256 dimensions
- Maximum number of chunks added to context: 20 (could be fewer)
**Known Limitations**
We have a few known limitations we're working on adding support for in the coming months:
1. Support for deterministic pre-search filtering using custom metadata.
2. Support for parsing images within documents (including images of charts, graphs, tables etc.)
3. Support for retrievals over structured file formats (like `csv` or `jsonl`).
4. Better support for summarization — the tool today is optimized for search queries.
## Vector stores
Vector Store objects give the File Search tool the ability to search your files. Adding a file to a `vector_store` automatically parses, chunks, embeds and stores the file in a vector database that's capable of both keyword and semantic search. Each `vector_store` can hold up to 10,000 files. Vector stores can be attached to both Assistants and Threads. Today, you can attach at most one vector store to an assistant and at most one vector store to a thread.
#### Creating vector stores and adding files
You can create a vector store and add files to it in a single API call:
<CodeSample
defaultLanguage="python"
code={{
python: `
vector_store = client.beta.vector_stores.create(
name="Product Documentation",
file_ids=['file_1', 'file_2', 'file_3', 'file_4', 'file_5']
)
`.trim(),
"node.js": `
const vectorStore = await openai.beta.vectorStores.create({
name: "Product Documentation",
file_ids: ['file_1', 'file_2', 'file_3', 'file_4', 'file_5']
});
`.trim(),
}}
/>
Adding files to vector stores is an async operation. To ensure the operation is complete, we recommend that you use the 'create and poll' helpers in our official SDKs. If you're not using the SDKs, you can retrieve the `vector_store` object and monitor its [`file_counts`](/docs/api-reference/vector-stores/object#vector-stores/object-file_counts) property to see the result of the file ingestion operation.
Files can also be added to a vector store after it's created by [creating vector store files](/docs/api-reference/vector-stores/createFile).
<CodeSample
defaultLanguage="python"
code={{
python: `
file = client.beta.vector_stores.files.create_and_poll(
vector_store_id="vs_abc123",
file_id="file-abc123"
)
`.trim(),
"node.js": `
const file = await openai.beta.vectorStores.files.createAndPoll(
"vs_abc123",
{ file_id: "file-abc123" }
);
`.trim(),
}}
/>
Alternatively, you can add several files to a vector store by [creating batches](/docs/api-reference/vector-stores/createBatch) of up to 500 files.
<CodeSample
defaultLanguage="python"
code={{
python: `
batch = client.beta.vector_stores.file_batches.create_and_poll(
vector_store_id="vs_abc123",
file_ids=['file_1', 'file_2', 'file_3', 'file_4', 'file_5']
)
`.trim(),
"node.js": `
const batch = await openai.beta.vectorStores.fileBatches.createAndPoll(
"vs_abc123",
{ file_ids: ["file_1", "file_2", "file_3", "file_4", "file_5"] },
);
`.trim(),
}}
/>
Similarly, these files can be removed from a vector store by either:
- Deleting the [vector store file object](/docs/api-reference/vector-stores/deleteFile) or,
- By deleting the underlying [file object](/docs/api-reference/files/delete) (which removes the file from all `vector_store` and `code_interpreter` configurations across all assistants and threads in your organization)
The maximum file size is 512 MB. Each file can contain no more than 5,000,000 tokens (computed automatically when you attach a file).
File Search supports a variety of file formats including `.pdf`, `.md`, and `.docx`. More details on the file extensions (and their corresponding MIME-types) supported can be found in the [Supported files](/docs/assistants/tools/file-search/supported-files) section below.
#### Attaching vector stores
You can attach vector stores to your Assistant or Thread using the `tool_resources` parameter.
<CodeSample
defaultLanguage="python"
code={{
python: `
assistant = client.beta.assistants.create(
instructions="You are a helpful product support assistant and you answer questions based on the files provided to you.",
model="gpt-4o",
tools=[{"type": "file_search"}],
tool_resources={
"file_search": {
"vector_store_ids": ["vs_1"]
}
}
)\n
thread = client.beta.threads.create(
messages=[ { "role": "user", "content": "How do I cancel my subscription?"} ],
tool_resources={
"file_search": {
"vector_store_ids": ["vs_2"]
}
}
)
`.trim(),
"node.js": `
const assistant = await openai.beta.assistants.create({
instructions: "You are a helpful product support assistant and you answer questions based on the files provided to you.",
model: "gpt-4o",
tools: [{"type": "file_search"}],
tool_resources: {
"file_search": {
"vector_store_ids": ["vs_1"]
}
}
});\n
const thread = await openai.beta.threads.create({
messages: [ { role: "user", content: "How do I cancel my subscription?"} ],
tool_resources: {
"file_search": {
"vector_store_ids": ["vs_2"]
}
}
});
`.trim(),
}}
/>
You can also attach a vector store to Threads or Assistants after they're created by updating them with the right `tool_resources`.
#### Ensuring vector store readiness before creating runs
We highly recommend that you ensure all files in a `vector_store` are fully processed before you create a run. This will ensure that all the data in your `vector_store` is searchable. You can check for `vector_store` readiness by using the polling helpers in our SDKs, or by manually polling the `vector_store` object to ensure the [`status`](/docs/api-reference/vector-stores/object#vector-stores/object-status) is `completed`.
As a fallback, we've built a **60 second maximum wait** in the Run object when the **thread's** vector store contains files that are still being processed. This is to ensure that any files your users upload in a thread are fully searchable before the run proceeds. This fallback wait _does not_ apply to the assistant's vector store.
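A minimal manual-polling sketch (the vector store ID is a placeholder; the SDK's `create_and_poll` helpers do this for you):
```python
import time

from openai import OpenAI

client = OpenAI()

vector_store_id = "vs_abc123"  # placeholder ID
while client.beta.vector_stores.retrieve(vector_store_id).status != "completed":
    time.sleep(1)  # wait for file processing to finish before creating a run
```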
#### Customizing File Search settings
You can customize how the `file_search` tool chunks your data and how many chunks it returns to the model context.
**Chunking configuration**
By default, `max_chunk_size_tokens` is set to `800` and `chunk_overlap_tokens` is set to `400`, meaning every file is indexed by being split up into 800-token chunks, with 400-token overlap between consecutive chunks.
You can adjust this by setting [`chunking_strategy`](/docs/api-reference/vector-stores-files/createFile#vector-stores-files-createfile-chunking_strategy) when adding files to the vector store. There are certain limitations to `chunking_strategy`:
- `max_chunk_size_tokens` must be between 100 and 4096 inclusive.
- `chunk_overlap_tokens` must be non-negative and should not exceed `max_chunk_size_tokens / 2`.
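For example, a static chunking strategy can be passed when adding a file to a vector store (the values below simply restate the defaults):
```python
from openai import OpenAI

client = OpenAI()

file = client.beta.vector_stores.files.create(
    vector_store_id="vs_abc123",
    file_id="file-abc123",
    chunking_strategy={
        "type": "static",
        "static": {
            "max_chunk_size_tokens": 800,
            "chunk_overlap_tokens": 400,
        },
    },
)
```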
**Number of chunks**
By default, the `file_search` tool outputs up to 20 chunks for `gpt-4*` models and up to 5 chunks for `gpt-3.5-turbo`. You can adjust this by setting [`file_search.max_num_results`](/docs/api-reference/assistants/createAssistant#assistants-createassistant-tools) in the tool when creating the assistant or the run.
Note that the `file_search` tool may output fewer than this number for a myriad of reasons:
- The total number of chunks is fewer than `max_num_results`.
- The total token size of all the retrieved chunks exceeds the token "budget" assigned to the `file_search` tool. The `file_search` tool currently has a token budget of:
- 4,000 tokens for `gpt-3.5-turbo`
- 16,000 tokens for `gpt-4*` models
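A sketch of setting `max_num_results` when creating an assistant (the value 10 is arbitrary):
```python
from openai import OpenAI

client = OpenAI()

assistant = client.beta.assistants.create(
    instructions="You are a helpful product support assistant and you answer questions based on the files provided to you.",
    model="gpt-4o",
    tools=[{"type": "file_search", "file_search": {"max_num_results": 10}}],
)
```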
#### Managing costs with expiration policies
The `file_search` tool uses the `vector_stores` object as its resource and you will be billed based on the [size](/docs/api-reference/vector-stores/object#vector-stores/object-bytes) of the `vector_store` objects created. The size of the vector store object is the sum of all the parsed chunks from your files and their corresponding embeddings.
Your first GB is free and, beyond that, usage is billed at $0.10/GB/day of vector storage. There are no other costs associated with vector store operations.
In order to help you manage the costs associated with these `vector_store` objects, we have added support for expiration policies in the `vector_store` object. You can set these policies when creating or updating the `vector_store` object.
<CodeSample
defaultLanguage="python"
code={{
python: `
vector_store = client.beta.vector_stores.create_and_poll(
name="Product Documentation",
file_ids=['file_1', 'file_2', 'file_3', 'file_4', 'file_5'],
expires_after={
"anchor": "last_active_at",
"days": 7
}
)
`.trim(),
"node.js": `
let vectorStore = await openai.beta.vectorStores.create({
name: "rag-store",
file_ids: ['file_1', 'file_2', 'file_3', 'file_4', 'file_5'],
expires_after: {
anchor: "last_active_at",
days: 7
}
});`.trim(),
}}
/>
**Thread vector stores have default expiration policies**
Vector stores created using thread helpers (like [`tool_resources.file_search.vector_stores`](/docs/api-reference/threads/createThread#threads-createthread-tool_resources) in Threads or [message.attachments](/docs/api-reference/messages/createMessage#messages-createmessage-attachments) in Messages) have a default expiration policy of 7 days after they were last active (defined as the last time the vector store was part of a run).
When a vector store expires, runs on that thread will fail. To fix this, you can simply recreate a new `vector_store` with the same files and reattach it to the thread.
<CodeSample
defaultLanguage="python"
code={{
python: `
all_files = list(client.beta.vector_stores.files.list("vs_expired"))\n
vector_store = client.beta.vector_stores.create(name="rag-store")
client.beta.threads.update(
"thread_abc123",
tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
)\n
for file_batch in chunked(all_files, 100):
client.beta.vector_stores.file_batches.create_and_poll(
vector_store_id=vector_store.id, file_ids=[file.id for file in file_batch]
)
`.trim(),
"node.js": `
const fileIds = [];
for await (const file of openai.beta.vectorStores.files.list(
"vs_toWTk90YblRLCkbE2xSVoJlF",
)) {
fileIds.push(file.id);
}\n
const vectorStore = await openai.beta.vectorStores.create({
name: "rag-store",
});
await openai.beta.threads.update("thread_abcd", {
tool_resources: { file_search: { vector_store_ids: [vectorStore.id] } },
});\n
for (const fileBatch of _.chunk(fileIds, 100)) {
await openai.beta.vectorStores.fileBatches.create(vectorStore.id, {
file_ids: fileBatch,
});
}
`.trim(),
}}
/>
## Supported files
_For `text/` MIME types, the encoding must be one of `utf-8`, `utf-16`, or `ascii`._
| File format | MIME type |
| ----------- | --------------------------------------------------------------------------- |
| `.c` | `text/x-c` |
| `.cs` | `text/x-csharp` |
| `.cpp` | `text/x-c++` |
| `.doc` | `application/msword` |
| `.docx` | `application/vnd.openxmlformats-officedocument.wordprocessingml.document` |
| `.html` | `text/html` |
| `.java` | `text/x-java` |
| `.json` | `application/json` |
| `.md` | `text/markdown` |
| `.pdf` | `application/pdf` |
| `.php` | `text/x-php` |
| `.pptx` | `application/vnd.openxmlformats-officedocument.presentationml.presentation` |
| `.py` | `text/x-python` |
| `.py` | `text/x-script.python` |
| `.rb` | `text/x-ruby` |
| `.tex` | `text/x-tex` |
| `.txt` | `text/plain` |
| `.css` | `text/css` |
| `.js` | `text/javascript` |
| `.sh` | `application/x-sh` |
| `.ts` | `application/typescript` |

@ -0,0 +1,223 @@
# Function calling (Beta)
Similar to the Chat Completions API, the Assistants API supports function calling. Function calling allows you to describe functions to the Assistants API and have it intelligently return the functions that need to be called along with their arguments.
## Quickstart
In this example, we'll create a weather assistant and define two functions,
`get_current_temperature` and `get_rain_probability`, as tools that the Assistant can call.
Depending on the user query, the model will invoke parallel function calling if using our
latest models released on or after Nov 6, 2023.
In our example that uses parallel function calling, we will ask the Assistant what the weather in
San Francisco is like today and the chances of rain. We also show how to output the Assistant's response with streaming.
### Step 1: Define functions
When creating your assistant, you will first define the functions under the `tools` param of the assistant.
<CodeSample
defaultLanguage="python"
code={{
python: `
from openai import OpenAI
client = OpenAI()
assistant = client.beta.assistants.create(
instructions="You are a weather bot. Use the provided functions to answer questions.",
model="gpt-4o",
tools=[
{
"type": "function",
"function": {
"name": "get_current_temperature",
"description": "Get the current temperature for a specific location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g., San Francisco, CA"
},
"unit": {
"type": "string",
"enum": ["Celsius", "Fahrenheit"],
"description": "The temperature unit to use. Infer this from the user's location."
}
},
"required": ["location", "unit"]
}
}
},
{
"type": "function",
"function": {
"name": "get_rain_probability",
"description": "Get the probability of rain for a specific location",
"parameters": {
"type": "object",
"properties": {
"location": {
"type": "string",
"description": "The city and state, e.g., San Francisco, CA"
}
},
"required": ["location"]
}
}
}
]
)
`.trim(),
"node.js": `
const assistant = await client.beta.assistants.create({
model: "gpt-4o",
instructions:
"You are a weather bot. Use the provided functions to answer questions.",
tools: [
{
type: "function",
function: {
name: "getCurrentTemperature",
description: "Get the current temperature for a specific location",
parameters: {
type: "object",
properties: {
location: {
type: "string",
description: "The city and state, e.g., San Francisco, CA",
},
unit: {
type: "string",
enum: ["Celsius", "Fahrenheit"],
description:
"The temperature unit to use. Infer this from the user's location.",
},
},
required: ["location", "unit"],
},
},
},
{
type: "function",
function: {
name: "getRainProbability",
description: "Get the probability of rain for a specific location",
parameters: {
type: "object",
properties: {
location: {
type: "string",
description: "The city and state, e.g., San Francisco, CA",
},
},
required: ["location"],
},
},
},
],
});`.trim(),
}}
/>
### Step 2: Create a Thread and add Messages
Create a Thread when a user starts a conversation and add Messages to the Thread as the user asks questions.
<CodeSample
defaultLanguage="python"
code={{
python: `
thread = client.beta.threads.create()
message = client.beta.threads.messages.create(
thread_id=thread.id,
role="user",
content="What's the weather in San Francisco today and the likelihood it'll rain?",
)
`.trim(),
"node.js": `
const thread = await client.beta.threads.create();
const message = await client.beta.threads.messages.create(thread.id, {
role: "user",
content: "What's the weather in San Francisco today and the likelihood it'll rain?",
});
`.trim(),
}}
/>
### Step 3: Initiate a Run
When you initiate a Run on a Thread containing a user Message that triggers one or more functions,
the Run will enter a `pending` status. After it processes, the Run will enter a `requires_action` state, which you can
verify by checking the Run's `status`. This indicates that you need to run the tools and submit their outputs to the
Assistant to continue Run execution. In our case, you will see two `tool_calls` within `required_action`, which indicates that the
user query triggered parallel function calling.
Note that runs expire ten minutes after creation. Be sure to submit your tool
outputs before the 10-minute mark.
<CodeSample
defaultLanguage="json"
code={{
json: `
{
"id": "run_qJL1kI9xxWlfE0z1yfL0fGg9",
...
"status": "requires_action",
"required_action": {
"submit_tool_outputs": {
"tool_calls": [
{
"id": "call_FthC9qRpsL5kBpwwyw6c7j4k",
"function": {
"arguments": "{\"location\": \"San Francisco, CA\"}",
"name": "get_rain_probability"
},
"type": "function"
},
{
"id": "call_RpEDoB8O0FTL9JoKTuCVFOyR",
"function": {
"arguments": "{\"location\": \"San Francisco, CA\", \"unit\": \"Fahrenheit\"}",
"name": "get_current_temperature"
},
"type": "function"
}
]
},
...
"type": "submit_tool_outputs"
}
}
`.trim(),
}}
/>
The Run object above is truncated for readability.
How you initiate a Run and submit `tool_calls` will differ depending on whether you are using streaming or not,
although in both cases all `tool_calls` need to be submitted at the same time.
You can then complete the Run by submitting the tool outputs from the functions you called.
Pass each `tool_call_id` referenced in the `required_action` object to match outputs to each function call.
<ContentSwitcher
initialValue="streaming"
options={[
{
label: "With streaming",
value: "streaming",
content: ,
},
{
label: "Without streaming",
value: "without-streaming",
content: ,
},
]}
/>
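As a rough sketch of the non-streaming path, the full loop could look like the example below. It assumes the `client`, `assistant`, and `thread` objects created in the earlier steps, uses the Python SDK's polling helpers (`create_and_poll` and `submit_tool_outputs_and_poll`), and stands in placeholder implementations for the two weather functions:
```python
import json

# Placeholder implementations of the two tools defined on the assistant.
def get_current_temperature(location, unit):
    return "57"

def get_rain_probability(location):
    return "0.06"

# Create the Run and poll until it reaches a terminal or requires_action status.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id,
    assistant_id=assistant.id,
)

if run.status == "requires_action":
    tool_outputs = []
    for tool_call in run.required_action.submit_tool_outputs.tool_calls:
        args = json.loads(tool_call.function.arguments)
        if tool_call.function.name == "get_current_temperature":
            output = get_current_temperature(args["location"], args["unit"])
        else:
            output = get_rain_probability(args["location"])
        tool_outputs.append({"tool_call_id": tool_call.id, "output": output})

    # Submit all tool outputs at once, then poll until the Run completes.
    run = client.beta.threads.runs.submit_tool_outputs_and_poll(
        thread_id=thread.id,
        run_id=run.id,
        tool_outputs=tool_outputs,
    )

if run.status == "completed":
    messages = client.beta.threads.messages.list(thread_id=thread.id)
    print(messages.data[0].content[0].text.value)
```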

@ -0,0 +1,446 @@
# Vision
Learn how to use vision capabilities to understand images.
## Introduction
Both GPT-4o and GPT-4 Turbo have vision capabilities, meaning the models can take in images and answer questions about them. Historically, language model systems have been limited by taking in a single input modality, text.
## Quick start
Images are made available to the model in two main ways: by passing a link to the image or by passing the base64 encoded image directly in the request. Images can be passed in the `user` messages.
<CodeSample
title="What's in this image?"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI\n
client = OpenAI()\n
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Whats in this image?"},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
},
},
],
}
],
max_tokens=300,
)\n
print(response.choices[0])
`.trim(),
curl: `
curl https://api.openai.com/v1/chat/completions \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{
"model": "gpt-4o",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
}
}
]
}
],
"max_tokens": 300
}'
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const response = await openai.chat.completions.create({
model: "gpt-4o",
messages: [
{
role: "user",
content: [
{ type: "text", text: "Whats in this image?" },
{
type: "image_url",
image_url: {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
},
},
],
},
],
});
console.log(response.choices[0]);
}
main();
`.trim(),
}}
/>
The model is best at answering general questions about what is present in the images. While it does understand the relationship between objects in images, it is not yet optimized to answer detailed questions about the location of certain objects in an image. For example, you can ask it what color a car is or what some ideas for dinner might be based on what is in your fridge, but if you show it an image of a room and ask it where the chair is, it may not answer the question correctly.
It is important to keep in mind the [limitations of the model](/docs/guides/vision/limitations) as you explore what use cases visual understanding can be applied to.
<a
href="https://cookbook.openai.com/examples/gpt_with_vision_for_video_understanding"
target="_blank"
>
<IconItem
icon={}
color="red"
title="Video understanding with vision"
className="mt-6"
>
Learn how to use GPT-4 with Vision to understand videos in the OpenAI Cookbook
</IconItem>
</a>
## Uploading base 64 encoded images
If you have an image or set of images locally, you can pass them to the model in base64 encoded format. Here is an example of this in action:
```python
import base64
import requests
# OpenAI API Key
api_key = "YOUR_OPENAI_API_KEY"
# Function to encode the image
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
# Path to your image
image_path = "path_to_your_image.jpg"
# Getting the base64 string
base64_image = encode_image(image_path)
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}
payload = {
"model": "gpt-4o",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
}
]
}
],
"max_tokens": 300
}
response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
print(response.json())
```
## Multiple image inputs
The Chat Completions API is capable of taking in and processing multiple image inputs in both base64 encoded format or as an image URL. The model will process each image and use the information from all of them to answer the question.
<CodeSample
title="Multiple image inputs"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI\n
client = OpenAI()
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are in these images? Is there any difference between them?",
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
},
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
},
},
],
}
],
max_tokens=300,
)
print(response.choices[0])
`.trim(),
curl: `
curl https://api.openai.com/v1/chat/completions \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{
"model": "gpt-4o",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What are in these images? Is there any difference between them?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
}
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
}
}
]
}
],
"max_tokens": 300
}'
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const response = await openai.chat.completions.create({
model: "gpt-4o",
messages: [
{
role: "user",
content: [
{ type: "text", text: "What are in these images? Is there any difference between them?" },
{
type: "image_url",
image_url: {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
},
},
{
type: "image_url",
image_url: {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
},
}
],
},
],
});
console.log(response.choices[0]);
}
main();
`.trim(),
}}
/>
Here the model is shown two copies of the same image and can answer questions about both or each of the images independently.
## Low or high fidelity image understanding
By controlling the `detail` parameter, which has three options, `low`, `high`, or `auto`, you have control over how the model processes the image and generates its textual understanding. By default, the model will use the `auto` setting which will look at the image input size and decide if it should use the `low` or `high` setting.
- `low` will enable the "low res" mode. The model will receive a low-res 512px x 512px version of the image, and represent the image with a budget of 85 tokens. This allows the API to return faster responses and consume fewer input tokens for use cases that do not require high detail.
- `high` will enable "high res" mode, which first allows the model to see the low res image (using 85 tokens) and then creates detailed crops using 170 tokens for each 512px x 512px tile.
<CodeSample
title="Choosing the detail level"
defaultLanguage="python"
code={{
python: `
from openai import OpenAI\n
client = OpenAI()\n
response = client.chat.completions.create(
model="gpt-4o",
messages=[
{
"role": "user",
"content": [
{"type": "text", "text": "Whats in this image?"},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"detail": "high"
},
},
],
}
],
max_tokens=300,
)\n
print(response.choices[0].message.content)
`.trim(),
curl: `
curl https://api.openai.com/v1/chat/completions \\
-H "Content-Type: application/json" \\
-H "Authorization: Bearer $OPENAI_API_KEY" \\
-d '{
"model": "gpt-4o",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Whats in this image?"
},
{
"type": "image_url",
"image_url": {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"detail": "high"
}
}
]
}
],
"max_tokens": 300
}'
`.trim(),
node: `
import OpenAI from "openai";\n
const openai = new OpenAI();\n
async function main() {
const response = await openai.chat.completions.create({
model: "gpt-4o",
messages: [
{
role: "user",
content: [
{ type: "text", text: "Whats in this image?" },
{
type: "image_url",
image_url: {
"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
"detail": "low"
},
},
],
},
],
});
console.log(response.choices[0]);
}
main();
`.trim(),
}}
/>
## Managing images
The Chat Completions API, unlike the Assistants API, is not stateful. That means you have to manage the messages (including images) you pass to the model yourself. If you want to pass the same image to the model multiple times, you will have to pass the image each time you make a request to the API.
For long-running conversations, we suggest passing images via URLs instead of base64. The latency of the model can also be improved by downsizing your images ahead of time to be less than the maximum size they are expected to be. For low res mode, we expect a 512px x 512px image. For high res mode, the short side of the image should be less than 768px and the long side should be less than 2,000px.
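As an illustration of that downsizing advice, here is a small sketch (assuming the Pillow library is available; the helper name and the example path are hypothetical) that shrinks a local image to fit within the high res bounds before encoding it:
```python
import base64
from io import BytesIO

from PIL import Image  # assumes Pillow is installed

def downsize_and_encode(image_path, short_side=768, long_side=2000):
    """Resize an image to roughly fit the high res bounds described above,
    then return it as a base64 data URL usable in an image_url block."""
    img = Image.open(image_path)
    # thumbnail() preserves aspect ratio and never upscales.
    img.thumbnail((long_side, long_side))
    width, height = img.size
    if min(width, height) > short_side:
        scale = short_side / min(width, height)
        img = img.resize((int(width * scale), int(height * scale)))
    buffer = BytesIO()
    img.convert("RGB").save(buffer, format="JPEG")
    encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f"data:image/jpeg;base64,{encoded}"

# Example (hypothetical path):
# data_url = downsize_and_encode("path_to_your_image.jpg")
```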
After an image has been processed by the model, it is deleted from OpenAI servers and not retained. [We do not use data uploaded via the OpenAI API to train our models](https://openai.com/enterprise-privacy).
## Limitations
While GPT-4 with vision is powerful and can be used in many situations, it is important to understand the limitations of the model. Here are some of the limitations we are aware of:
- Medical images: The model is not suitable for interpreting specialized medical images like CT scans and shouldn't be used for medical advice.
- Non-English: The model may not perform optimally when handling images with text of non-Latin alphabets, such as Japanese or Korean.
- Small text: Enlarge text within the image to improve readability, but avoid cropping important details.
- Rotation: The model may misinterpret rotated / upside-down text or images.
- Visual elements: The model may struggle to understand graphs or text where colors or styles like solid, dashed, or dotted lines vary.
- Spatial reasoning: The model struggles with tasks requiring precise spatial localization, such as identifying chess positions.
- Accuracy: The model may generate incorrect descriptions or captions in certain scenarios.
- Image shape: The model struggles with panoramic and fisheye images.
- Metadata and resizing: The model doesn't process original file names or metadata, and images are resized before analysis, affecting their original dimensions.
- Counting: May give approximate counts for objects in images.
- CAPTCHAS: For safety reasons, we have implemented a system to block the submission of CAPTCHAs.
## Calculating costs
Image inputs are metered and charged in tokens, just as text inputs are. The token cost of a given image is determined by two factors: its size and the `detail` option on each `image_url` block. All images with `detail: low` cost 85 tokens each. `detail: high` images are first scaled to fit within a 2048 x 2048 square, maintaining their aspect ratio. Then, they are scaled such that the shortest side of the image is 768px long. Finally, we count how many 512px squares the image consists of. Each of those squares costs **170 tokens**. Another **85 tokens** are always added to the final total.
Here are some examples demonstrating the above.
- A 1024 x 1024 square image in `detail: high` mode costs 765 tokens
- 1024 is less than 2048, so there is no initial resize.
- The shortest side is 1024, so we scale the image down to 768 x 768.
- 4 512px square tiles are needed to represent the image, so the final token cost is `170 * 4 + 85 = 765`.
- A 2048 x 4096 image in `detail: high` mode costs 1105 tokens
- We scale down the image to 1024 x 2048 to fit within the 2048 square.
- The shortest side is 1024, so we further scale down to 768 x 1536.
- 6 512px tiles are needed, so the final token cost is `170 * 6 + 85 = 1105`.
- A 4096 x 8192 image in `detail: low` mode costs 85 tokens
- Regardless of input size, low detail images are a fixed cost.
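As a sketch of that arithmetic (the function below is illustrative, not part of the API), the tiling rules can be written out directly:
```python
import math

def image_token_cost(width, height, detail="high"):
    """Estimate input tokens for one image, following the rules above."""
    if detail == "low":
        return 85
    # Scale to fit within a 2048 x 2048 square, preserving aspect ratio.
    if max(width, height) > 2048:
        scale = 2048 / max(width, height)
        width, height = int(width * scale), int(height * scale)
    # Scale so the shortest side is 768px.
    scale = 768 / min(width, height)
    width, height = int(width * scale), int(height * scale)
    # Count the 512px tiles needed to cover the image.
    tiles = math.ceil(width / 512) * math.ceil(height / 512)
    return 170 * tiles + 85

print(image_token_cost(1024, 1024))                 # 765
print(image_token_cost(2048, 4096))                 # 1105
print(image_token_cost(4096, 8192, detail="low"))   # 85
```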
## FAQ
### Can I fine-tune the image capabilities in `gpt-4`?
No, we do not support fine-tuning the image capabilities of `gpt-4` at this time.
### Can I use `gpt-4` to generate images?
No, you can use `dall-e-3` to generate images and `gpt-4o` or `gpt-4-turbo` to understand images.
### What type of files can I upload?
We currently support PNG (.png), JPEG (.jpeg and .jpg), WEBP (.webp), and non-animated GIF (.gif).
### Is there a limit to the size of the image I can upload?
Yes, we restrict image uploads to 20MB per image.
### Can I delete an image I uploaded?
No, we will delete the image for you automatically after it has been processed by the model.
### Where can I learn more about the considerations of GPT-4 with Vision?
You can find details about our evaluations, preparation, and mitigation work in the [GPT-4 with Vision system card](https://openai.com/contributions/gpt-4v).
We have further implemented a system to block the submission of CAPTCHAs.
### How do rate limits for GPT-4 with Vision work?
We process images at the token level, so each image we process counts towards your tokens per minute (TPM) limit. See the calculating costs section for details on the formula used to determine token count per image.
### Can GPT-4 with Vision understand image metadata?
No, the model does not receive image metadata.
### What happens if my image is unclear?
If an image is ambiguous or unclear, the model will do its best to interpret it. However, the results may be less accurate. A good rule of thumb is that if an average human cannot see the info in an image at the resolutions used in low/high res mode, then the model cannot either.

@ -0,0 +1,21 @@
# What's new in v2? (Beta)
## April 2024
We are announcing a variety of new features and improvements to the Assistants API and moving our Beta to a new API version, `OpenAI-Beta: assistants=v2`. Here's what's new:
- We're launching an [improved retrieval tool called `file_search`](/docs/assistants/tools/file-search), which can ingest up to 10,000 files per assistant - 500x more than before. It is faster, supports parallel queries through multi-threaded searches, and features enhanced reranking and query rewriting.
- Alongside `file_search`, we're introducing [`vector_store` objects](/docs/assistants/tools/file-search/vector-stores) in the API. Once a file is added to a vector store, it's automatically parsed, chunked, embedded, and made ready to be searched. Vector stores can be used across assistants and threads, simplifying file management and billing.
- You can now [control the maximum number of tokens](/docs/assistants/how-it-works/context-window-management) a run uses in the Assistants API, allowing you to manage token usage costs. You can also set limits on the number of previous / recent messages used in each run.
- We've added support for the [`tool_choice` parameter](/docs/api-reference/runs/object#runs/object-tool_choice) which can be used to force the use of a specific tool (like `file_search`, `code_interpreter`, or a `function`) in a particular run.
- You can now [create messages with the role `assistant`](/docs/api-reference/messages/createMessage#messages-createmessage-role) to create custom conversation histories in Threads.
- Assistant and Run objects now support popular model configuration parameters like [`temperature`](/docs/api-reference/assistants/createAssistant#assistants-createassistant-temperature), [`response_format` (JSON mode)](/docs/api-reference/assistants/createAssistant#assistants-createassistant-response_format), and [`top_p`](/docs/api-reference/assistants/createAssistant#assistants-createassistant-top_p).
- You can now use [fine-tuned models](/docs/guides/fine-tuning) in the Assistants API. At the moment, only fine-tuned versions of `gpt-3.5-turbo-0125` are supported.
- Assistants API now supports [streaming](/docs/assistants/overview/step-4-create-a-run?context=with-streaming).
- We've added several streaming and polling helpers to our [Node](https://github.com/openai/openai-node/blob/master/helpers.md) and [Python](https://github.com/openai/openai-python/blob/main/helpers.md) SDKs.
See our [migration guide](/docs/assistants/migration) to learn more about how to migrate your tool usage to the latest version of the Assistants API.
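To make the list above concrete, here is a rough sketch of how several v2 features (vector stores, `file_search`, `tool_choice`, and token limits) fit together. The file name and question are placeholders, and the exact helper methods may vary slightly by SDK version:
```python
from openai import OpenAI

client = OpenAI()

# Create a vector store and add a file; it is parsed, chunked, and embedded automatically.
vector_store = client.beta.vector_stores.create(name="Support docs")
client.beta.vector_stores.file_batches.upload_and_poll(
    vector_store_id=vector_store.id,
    files=[open("handbook.pdf", "rb")],  # placeholder file
)

# Attach the vector store to an assistant that uses the file_search tool.
assistant = client.beta.assistants.create(
    model="gpt-4o",
    instructions="Answer questions using the attached documents.",
    tools=[{"type": "file_search"}],
    tool_resources={"file_search": {"vector_store_ids": [vector_store.id]}},
    temperature=0.2,
)

thread = client.beta.threads.create(
    messages=[{"role": "user", "content": "What is our refund policy?"}]
)

# Force file_search on this run and cap token usage.
run = client.beta.threads.runs.create_and_poll(
    thread_id=thread.id,
    assistant_id=assistant.id,
    tool_choice={"type": "file_search"},
    max_completion_tokens=500,
)
```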

Binary image file added (not shown; 163 KiB)

Binary image file added (not shown; 191 KiB)
@ -1326,14 +1326,16 @@
- vision
- whisper
# - title: Using Azure Function and the Microsoft Graph Search API to Query Office365 and SharePoint with ChatGPT
# path: examples/chatgpt/sharepoint_azure_function/Using_Azure_Functions_and_Microsoft_Graph_to_Query_SharePoint.md
# date: 2024-05-24
# authors:
# - maxreid-openai
# tags:
# - completions
# - chatgpt
- title: Azure AI Search with Azure Functions and GPT Actions in ChatGPT
path: examples/chatgpt/rag-quickstart/azure/Azure_AI_Search_with_Azure_Functions_and_GPT_Actions_in_ChatGPT.ipynb
date: 2024-07-08
authors:
- maxreid-openai
tags:
- embeddings
- chatgpt
- tiktoken
- completions
- title: GPT Actions library - getting started
path: examples/chatgpt/gpt_actions_library/.gpt_action_getting_started.ipynb
@ -1433,4 +1435,4 @@
- rupert-openai
tags:
- gpt-actions-library
- chatgpt
- chatgpt
