Update embeddings_utils.py and related notebooks to API V1 (issue #855) (#857)

Gabor Cselle committed 7 months ago (via GitHub)
commit 988139d70e (parent bd58636a8a)

@@ -235,10 +235,10 @@
    }
   ],
   "source": [
-   "from openai.embeddings_utils import get_embedding\n",
+   "from utils.embeddings_utils import get_embedding\n",
    "\n",
    "df = pd.DataFrame(all_funcs)\n",
-   "df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))\n",
+   "df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, model='text-embedding-ada-002'))\n",
    "df['filepath'] = df['filepath'].map(lambda x: Path(x).relative_to(code_root))\n",
    "df.to_csv(\"data/code_search_openai-python.csv\", index=False)\n",
    "df.head()"
@@ -266,10 +266,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
-   "from openai.embeddings_utils import cosine_similarity\n",
+   "from utils.embeddings_utils import cosine_similarity\n",
    "\n",
    "def search_functions(df, code_query, n=3, pprint=True, n_lines=7):\n",
-   "    embedding = get_embedding(code_query, engine='text-embedding-ada-002')\n",
+   "    embedding = get_embedding(code_query, model='text-embedding-ada-002')\n",
    "    df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))\n",
    "\n",
    "    res = df.sort_values('similarities', ascending=False).head(n)\n",

@@ -51,7 +51,7 @@
    "from sklearn.model_selection import train_test_split # for splitting train & test data\n",
    "import torch # for matrix optimization\n",
    "\n",
-   "from openai.embeddings_utils import get_embedding, cosine_similarity # for embeddings\n"
+   "from utils.embeddings_utils import get_embedding, cosine_similarity # for embeddings\n"
   ]
  },
  {

@@ -193,7 +193,7 @@
   "source": [
    "def request_completion(prompt):\n",
    "\n",
-   "    completion_response = openai.Completion.create(\n",
+   "    completion_response = openai.completions.create(\n",
    "                            prompt=prompt,\n",
    "                            temperature=0,\n",
    "                            max_tokens=5,\n",
@@ -211,7 +211,7 @@
    "    prompt = prompt.replace('DESCRIPTION_TEXT',transaction['Description'])\n",
    "    prompt = prompt.replace('TRANSACTION_VALUE',str(transaction['Transaction value (£)']))\n",
    "\n",
-   "    classification = request_completion(prompt)['choices'][0]['text'].replace('\\n','')\n",
+   "    classification = request_completion(prompt).choices[0].text.replace('\\n','')\n",
    "\n",
    "    return classification\n",
    "\n",
@@ -304,7 +304,7 @@
    "\n",
    "# Use our completion function to return a prediction\n",
    "completion_response = request_completion(prompt)\n",
-   "print(completion_response['choices'][0]['text'])\n"
+   "print(completion_response.choices[0].text)\n"
   ]
  },
  {
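
The indexing changes in the two hunks above follow from V1 returning typed pydantic objects rather than plain dicts: attribute access replaces key lookup, and `.model_dump()` recovers a dict where older code still needs one. A short illustration:

```python
# V1 responses are pydantic models, so use attribute access:
text = completion_response.choices[0].text

# completion_response['choices'][0]['text']  # V0 style; subscripting
# a V1 response object raises TypeError.

# Code that still expects a dict can convert explicitly:
print(completion_response.model_dump()["choices"][0]["text"])
```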
@@ -351,7 +351,7 @@
    " Building Improvement    14\n",
    " Could not classify        5\n",
    " Literature & Archive      3\n",
-   " Software/IT               2\n",
+   " Software/IT              2\n",
    " Utility Bills             1\n",
    "Name: Classification, dtype: int64"
@@ -916,8 +916,8 @@
   "source": [
    "from utils.embeddings_utils import get_embedding\n",
    "\n",
-   "df['babbage_similarity'] = df.combined.apply(lambda x: get_embedding(x, engine='text-similarity-babbage-001'))\n",
-   "df['babbage_search'] = df.combined.apply(lambda x: get_embedding(x, engine='text-search-babbage-doc-001'))\n",
+   "df['babbage_similarity'] = df.combined.apply(lambda x: get_embedding(x, model='text-similarity-babbage-001'))\n",
+   "df['babbage_search'] = df.combined.apply(lambda x: get_embedding(x, model='text-search-babbage-doc-001'))\n",
    "df.to_csv(embedding_path)\n"
   ]
  },
@@ -2203,7 +2203,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.10.8"
+  "version": "3.11.3"
  }
 },
 "nbformat": 4,

@@ -59,7 +59,7 @@
    "def search_reviews(df, product_description, n=3, pprint=True):\n",
    "    product_embedding = get_embedding(\n",
    "        product_description,\n",
-   "        engine=\"text-embedding-ada-002\"\n",
+   "        model=\"text-embedding-ada-002\"\n",
    "    )\n",
    "    df[\"similarity\"] = df.embedding.apply(lambda x: cosine_similarity(x, product_embedding))\n",
    "\n",

@@ -138,7 +138,7 @@
   "source": [
    "from utils.embeddings_utils import get_embeddings\n",
    "# NOTE: The following code will send a query of batch size 200 to /embeddings\n",
-   "matrix = get_embeddings(samples[\"text\"].to_list(), engine=\"text-embedding-ada-002\")\n"
+   "matrix = get_embeddings(samples[\"text\"].to_list(), model=\"text-embedding-ada-002\")\n"
   ]
  },
  {
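
As the NOTE in that cell says, `get_embeddings` ships the whole list in a single request, and the helper (see the embeddings_utils.py hunk below) asserts a ceiling of 2048 inputs per batch. For larger datasets a thin chunking wrapper is enough; a sketch, assuming the helper as defined in this commit:

```python
def get_embeddings_chunked(texts, model="text-embedding-ada-002", batch_size=2048):
    # Split the input into batches the helper accepts (<= 2048 each)
    # and concatenate the returned vectors in their original order.
    out = []
    for i in range(0, len(texts), batch_size):
        out.extend(get_embeddings(texts[i : i + batch_size], model=model))
    return out
```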

@@ -93,7 +93,7 @@
    "    labels = ['negative', 'positive'],\n",
    "    model = EMBEDDING_MODEL,\n",
    "):\n",
-   "    label_embeddings = [get_embedding(label, engine=model) for label in labels]\n",
+   "    label_embeddings = [get_embedding(label, model=model) for label in labels]\n",
    "\n",
    "    def label_score(review_embedding, label_embeddings):\n",
    "        return cosine_similarity(review_embedding, label_embeddings[1]) - cosine_similarity(review_embedding, label_embeddings[0])\n",

@@ -15,51 +15,53 @@ import pandas as pd

 @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
-def get_embedding(text: str, engine="text-similarity-davinci-001", **kwargs) -> List[float]:
+def get_embedding(text: str, model="text-similarity-davinci-001", **kwargs) -> List[float]:
     # replace newlines, which can negatively affect performance.
     text = text.replace("\n", " ")

-    return openai.Embedding.create(input=[text], engine=engine, **kwargs)["data"][0]["embedding"]
+    response = openai.embeddings.create(input=[text], model=model, **kwargs)
+
+    return response.data[0].embedding


 @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
 async def aget_embedding(
-    text: str, engine="text-similarity-davinci-001", **kwargs
+    text: str, model="text-similarity-davinci-001", **kwargs
 ) -> List[float]:
     # replace newlines, which can negatively affect performance.
     text = text.replace("\n", " ")

-    return (await openai.Embedding.acreate(input=[text], engine=engine, **kwargs))["data"][0][
+    return (await openai.embeddings.create(input=[text], model=model, **kwargs))["data"][0][
         "embedding"
     ]


 @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
 def get_embeddings(
-    list_of_text: List[str], engine="text-similarity-babbage-001", **kwargs
+    list_of_text: List[str], model="text-similarity-babbage-001", **kwargs
 ) -> List[List[float]]:
     assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

     # replace newlines, which can negatively affect performance.
     list_of_text = [text.replace("\n", " ") for text in list_of_text]

-    data = openai.Embedding.create(input=list_of_text, engine=engine, **kwargs).data
-    return [d["embedding"] for d in data]
+    data = openai.embeddings.create(input=list_of_text, model=model, **kwargs).data
+    return [d.embedding for d in data]


 @retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
 async def aget_embeddings(
-    list_of_text: List[str], engine="text-similarity-babbage-001", **kwargs
+    list_of_text: List[str], model="text-similarity-babbage-001", **kwargs
 ) -> List[List[float]]:
     assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

     # replace newlines, which can negatively affect performance.
     list_of_text = [text.replace("\n", " ") for text in list_of_text]

-    data = (await openai.Embedding.acreate(input=list_of_text, engine=engine, **kwargs)).data
-    return [d["embedding"] for d in data]
+    data = (await openai.embeddings.create(input=list_of_text, model=model, **kwargs)).data
+    return [d.embedding for d in data]


 def cosine_similarity(a, b):
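
A quick usage sketch of the updated helpers, assuming the file lives at utils/embeddings_utils.py as the notebook imports above indicate:

```python
from utils.embeddings_utils import cosine_similarity, get_embedding, get_embeddings

# Both helpers now take model=... instead of the retired engine=...
a = get_embedding("red apples", model="text-embedding-ada-002")
b, c = get_embeddings(["green pears", "ripe bananas"], model="text-embedding-ada-002")
print(cosine_similarity(a, b), cosine_similarity(a, c))
```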
