|
|
|
@ -15,51 +15,53 @@ import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text: str, model="text-similarity-davinci-001", **kwargs) -> List[float]:
    """Return the embedding vector for *text* from the OpenAI embeddings API.

    Retries with random exponential backoff (1-20s, up to 6 attempts) on failure.

    Args:
        text: The input text to embed.
        model: The embedding model name (1.x API ``model`` parameter).
        **kwargs: Extra keyword arguments forwarded to ``openai.embeddings.create``.

    Returns:
        The embedding as a list of floats.
    """
    # Replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")

    # 1.x client returns a typed object; embeddings are attributes, not dict keys.
    response = openai.embeddings.create(input=[text], model=model, **kwargs)

    return response.data[0].embedding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
async def aget_embedding(
    text: str, model="text-similarity-davinci-001", **kwargs
) -> List[float]:
    """Async variant of ``get_embedding``: embed *text* via the OpenAI API.

    Retries with random exponential backoff (1-20s, up to 6 attempts) on failure.
    NOTE(review): assumes ``openai.embeddings.create`` is awaitable here (i.e. an
    async client is configured at module level) — confirm against the imports.

    Args:
        text: The input text to embed.
        model: The embedding model name (1.x API ``model`` parameter).
        **kwargs: Extra keyword arguments forwarded to ``openai.embeddings.create``.

    Returns:
        The embedding as a list of floats.
    """
    # Replace newlines, which can negatively affect performance.
    text = text.replace("\n", " ")

    # Bug fix: the 1.x response object is not subscriptable — ["data"][0]["embedding"]
    # raises TypeError. Use attribute access, consistent with aget_embeddings.
    response = await openai.embeddings.create(input=[text], model=model, **kwargs)

    return response.data[0].embedding
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embeddings(
    list_of_text: List[str], model="text-similarity-babbage-001", **kwargs
) -> List[List[float]]:
    """Return embedding vectors for a batch of texts from the OpenAI API.

    Retries with random exponential backoff (1-20s, up to 6 attempts) on failure.

    Args:
        list_of_text: Texts to embed; at most 2048 per call (API batch limit).
        model: The embedding model name (1.x API ``model`` parameter).
        **kwargs: Extra keyword arguments forwarded to ``openai.embeddings.create``.

    Returns:
        One embedding (list of floats) per input text, in input order.
    """
    assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

    # Replace newlines, which can negatively affect performance.
    list_of_text = [text.replace("\n", " ") for text in list_of_text]

    # 1.x client returns typed objects; embeddings are attributes, not dict keys.
    data = openai.embeddings.create(input=list_of_text, model=model, **kwargs).data

    return [d.embedding for d in data]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
async def aget_embeddings(
    list_of_text: List[str], model="text-similarity-babbage-001", **kwargs
) -> List[List[float]]:
    """Async variant of ``get_embeddings``: embed a batch of texts via the OpenAI API.

    Retries with random exponential backoff (1-20s, up to 6 attempts) on failure.
    NOTE(review): assumes ``openai.embeddings.create`` is awaitable here (i.e. an
    async client is configured at module level) — confirm against the imports.

    Args:
        list_of_text: Texts to embed; at most 2048 per call (API batch limit).
        model: The embedding model name (1.x API ``model`` parameter).
        **kwargs: Extra keyword arguments forwarded to ``openai.embeddings.create``.

    Returns:
        One embedding (list of floats) per input text, in input order.
    """
    assert len(list_of_text) <= 2048, "The batch size should not be larger than 2048."

    # Replace newlines, which can negatively affect performance.
    list_of_text = [text.replace("\n", " ") for text in list_of_text]

    # 1.x client returns typed objects; embeddings are attributes, not dict keys.
    response = await openai.embeddings.create(input=list_of_text, model=model, **kwargs)
    data = response.data

    return [d.embedding for d in data]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cosine_similarity(a, b):
|
|
|
|
|