Mirror of https://github.com/openai/openai-cookbook, synced 2024-11-04 06:00:33 +00:00

Remove legacy transition guides from 2021/2022 (#718)

parent: fd4e31bb00
commit: 78c6ed57ca
@@ -1,50 +0,0 @@

# Deprecation of Answers, Classification, and Search

In 2021, OpenAI released specialized endpoints in beta for Answers, Classification, and Search.

While these specialized endpoints were convenient, they had two drawbacks:

1. These specialized endpoints were eclipsed by techniques that achieved better results.
2. These specialized endpoints were more difficult to customize and optimize for individual use cases.

As a result, **the Answers, Classifications, and Search endpoints are being deprecated.**

## Timeline of deprecation

For those who have not used these endpoints, nothing will change except that access will no longer be available.

**For existing users of these endpoints, access will continue until December 3, 2022.** Before that date, we strongly encourage developers to switch over to newer techniques which produce better results.

## How to transition

We've written guides and code examples for transitioning from the deprecated API endpoints to better methods.

### Answers

[Guide: How to transition off the Answers endpoint](https://help.openai.com/en/articles/6233728-answers-transition-guide)

* Option 1: transition to embeddings-based search **(recommended)**; a minimal sketch follows this list
  * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
* Option 2: reimplement Answers endpoint functionality
  * Example code: [answers_functionality_example.py](answers_functionality_example.py)
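For a rough sense of what the recommended option involves, here is a minimal sketch of embeddings-based search, assuming the pre-v1 `openai` Python library and `numpy`. The embedding model name, the toy corpus, and the `top_n` cutoff are illustrative assumptions; the linked notebook shows the full recipe.

```python
import numpy as np
import openai


def embed(texts, model="text-embedding-ada-002"):  # model name is an assumption
    # One call returns an embedding vector for each input string, in order.
    response = openai.Embedding.create(input=texts, model=model)
    return [np.array(item["embedding"]) for item in response["data"]]


def embeddings_search(query, documents, top_n=3):
    doc_vectors = embed(documents)
    query_vector = embed([query])[0]
    # Rank documents by cosine similarity between the query and document embeddings.
    scores = [
        float(np.dot(query_vector, d) / (np.linalg.norm(query_vector) * np.linalg.norm(d)))
        for d in doc_vectors
    ]
    ranked = sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]


print(embeddings_search("What is the capital of China?", ["Beijing is the capital of China.", "Bananas are yellow."]))
```

Unlike the Answers endpoint, the document embeddings are computed once and can be cached, so repeated queries only require embedding the query itself.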
### Classification

[Guide: How to transition off the Classifications endpoint](https://help.openai.com/en/articles/6272941-classifications-transition-guide)

* Option 1: transition to fine-tuning **(recommended)**; a minimal sketch follows this list
  * Example code: [Fine-tuned_classification.ipynb](../examples/Fine-tuned_classification.ipynb)
* Option 2: transition to embeddings
  * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
* Option 3: reimplement Classifications endpoint functionality
  * Example code: [classification_functionality_example.py](classification_functionality_example.py)
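For a rough sense of what the recommended option involves, here is a minimal sketch of classification via fine-tuning, assuming the pre-v1 `openai` Python library. The training file name, the toy examples, the ` ->` separator, and the `ada` base model are illustrative assumptions; the linked notebook shows the full recipe.

```python
import json

import openai

# 1. Write training data as prompt/completion pairs, one JSON object per line.
training_examples = [
    {"prompt": "I loved this movie! ->", "completion": " positive"},
    {"prompt": "The plot made no sense at all. ->", "completion": " negative"},
]
with open("classification_training_data.jsonl", "w") as f:
    for example in training_examples:
        f.write(json.dumps(example) + "\n")

# 2. Upload the file and start a fine-tune job on a base model.
training_file = openai.File.create(
    file=open("classification_training_data.jsonl", "rb"), purpose="fine-tune"
)
fine_tune = openai.FineTune.create(training_file=training_file["id"], model="ada")

# 3. Once the job finishes, classify new text with the resulting model
#    (fine_tuned_model is only populated after the job succeeds):
# completion = openai.Completion.create(
#     model=fine_tune["fine_tuned_model"],
#     prompt="What a waste of two hours. ->",
#     max_tokens=1,
#     temperature=0,
# )
# print(completion["choices"][0]["text"])
```

Because the labeled examples are baked into the fine-tuned model's weights, the inference prompt only needs the text to classify rather than a stuffed context of examples.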
### Search

[Guide: How to transition off the Search endpoint](https://help.openai.com/en/articles/6272952-search-transition-guide)

* Option 1: transition to embeddings-based search **(recommended)**
  * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
* Option 2: reimplement Search endpoint functionality
  * Example code: [search_functionality_example.py](search_functionality_example.py)
@@ -1,304 +0,0 @@

from transformers import GPT2TokenizerFast

import openai

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

MAX_TOKENS_LIMIT = 2048
ANSWERS_INSTRUCTION = "Please answer the question according to the above context.\n"
CONTEXT_TEMPLATE = "===\nContext: {context}\n===\n"


def extract_instruction(instruction):
    """
    Extract the `instruction` parameter and format it properly.
    If it does not exist, return an empty string.
    """
    if instruction is None:
        return ""

    return f"{instruction.strip()}\n\n"


def semantic_search(
    search_model, query_for_search, file_id=None, max_documents=None, examples=None
):
    """
    :param examples: A list of {"text": ...} or {"text": ..., "label": ...}.
    :return:
        a list of semantic search result dicts of documents sorted by "score":
        [
            {
                "document": ...,
                "object": "search_result",
                "score": ...,
                "text": ...,
            },
            ...
        ]
    """
    assert (examples is None) ^ (file_id is None)  # xor

    if file_id is not None:
        # This is where you'd do an Elasticsearch call. Since there isn't an example of this
        # we can query, we'll raise an error.
        # The return value from this would be a list of examples.
        raise NotImplementedError()

    # This isn't quite accurate since Search is also being deprecated. See our search guide for
    # more information.

    search_result = openai.Search.create(
        model=search_model,
        documents=[x["text"] for x in examples],
        query=query_for_search,
    )

    info_dict = {d["document"]: d for d in search_result["data"]}
    sorted_doc_ids = sorted(
        info_dict.keys(), key=lambda x: info_dict[x]["score"], reverse=True
    )
    if max_documents:
        sorted_doc_ids = sorted_doc_ids[:max_documents]
    return [info_dict[i] for i in sorted_doc_ids]


def select_by_length(
    sorted_doc_infos,
    max_token_len,
    lambda_fn=None,
):
    """
    Given a list of documents sorted by score, select as many documents as possible
    as long as the total token length does not go above `max_token_len`.

    :param sorted_doc_infos: A list of semantic search result dicts of documents sorted by "score".
    :param max_token_len: The maximum token length for selected documents.
    :param lambda_fn: A function that takes in a search result dict and outputs a formatted
        example for context stuffing.
    :return: A tuple of (
        a concatenation of the selected documents used as context,
        a list of the selected document infos
    )
    """
    if not sorted_doc_infos:
        return "", []

    selected_indices = []
    total_doc_tokens = 0
    doc_dict = {}
    for i, doc_info in enumerate(sorted_doc_infos):
        doc = lambda_fn(doc_info) if lambda_fn else doc_info["text"]
        n_doc_tokens = len(tokenizer.encode(doc))
        if total_doc_tokens + n_doc_tokens < max_token_len:
            total_doc_tokens += n_doc_tokens
            selected_indices.append(i)
            doc_dict[i] = doc

    # The top ranked documents should go at the end.
    selected_indices = selected_indices[::-1]

    context = "".join([doc_dict[i] for i in selected_indices])
    selected_doc_infos = [sorted_doc_infos[i] for i in selected_indices]
    return context, selected_doc_infos


def answers(
    examples,
    question,
    model,
    examples_context,
    file_id=None,
    documents=None,
    logit_bias=None,
    max_rerank=200,
    max_tokens=16,
    alternative_question=None,
    search_model="ada",
    temperature=0.0,
    logprobs=0,
    stop=None,
    n=1,
):
    """
    Given a question, a list of (question, answer) pairs as examples, and a list of
    documents for context, construct a prompt that includes all the QA examples and
    the most relevant context documents, then answer the question with a completion call.

    The constructed prompt for the final completion call:
    ```
    Please answer the question according to the above context.

    ===
    Context: {{ the context for example QA pairs. }}
    ===
    Q: example 1 question
    A: example 1 answer
    ---
    Q: example 2 question
    A: example 2 answer
    ===
    Context: {{ a list of relevant documents sorted via search(question, documents) }}
    ===
    Q: question
    A:
    ```

    The returned object has a structure like:
    {
        "answers": [
            "Beijing",
            "Beijing, China"
        ],
        "completion": "xxx-xxx",
        "object": "answer",
        "selected_documents": [
            {
                "document": ...,  # document index, same as in search/ results.
                "object": "search_result",
                "text": ...,
            },
            ...
        ],
    }
    """

    examples = examples if examples else []

    example_prompts = [f"Q: {x}\nA: {y}" for x, y in examples]
    prompt = f"Q: {question}\nA:"

    # Append all the QA examples into the prompt.
    if examples_context:
        examples_context = CONTEXT_TEMPLATE.format(context=examples_context)
    instruction = (
        ANSWERS_INSTRUCTION + examples_context + "\n---\n".join(example_prompts) + "\n"
    )

    logit_bias = logit_bias if logit_bias is not None else {}

    if file_id is None and documents is None:
        raise Exception("Please submit at least one of `documents` or `file`.")
    if file_id is not None and documents is not None:
        raise Exception("Please submit only one of `documents` or `file`.")

    instruction = extract_instruction(instruction)

    n_instruction_tokens = len(tokenizer.encode(instruction))
    n_prompt_tokens = len(tokenizer.encode(prompt))
    n_query_tokens = len(tokenizer.encode(question))
    n_context_tokens = len(tokenizer.encode(CONTEXT_TEMPLATE.format(context="")))

    if documents is not None:
        documents = [doc.strip() + " " for doc in documents]
        n_docs_tokens = [len(tokenizer.encode(doc)) for doc in documents]

    # Aside from all the required content, how many tokens are left for context stuffing.
    leftover_token_len = MAX_TOKENS_LIMIT - (
        n_instruction_tokens + n_context_tokens + n_prompt_tokens + max_tokens
    )
    sorted_doc_infos = []

    question_for_search = (
        alternative_question if alternative_question is not None else question
    )
    if file_id is not None:
        # Search within an uploaded file (not implemented in this example).
        sorted_doc_infos = semantic_search(
            search_model,
            question_for_search,
            file_id=file_id,
            max_documents=max_rerank,
        )

    elif len(documents) == 0:
        # If no context document is provided, do nothing.
        pass

    elif min(n_docs_tokens) >= leftover_token_len:
        # If there is no room for adding any context doc.
        pass

    elif (max_rerank is None or max_rerank >= len(documents)) and sum(
        n_docs_tokens
    ) < leftover_token_len:
        # If the total length of the docs is short enough, add all of them.
        selected_indices = list(range(len(documents)))

        sorted_doc_infos = [
            {"document": i, "text": documents[i]} for i in selected_indices
        ]

    elif n_query_tokens + max(n_docs_tokens) >= MAX_TOKENS_LIMIT:
        # If the prompt and the longest document together go above the limit.
        total_tokens = n_query_tokens + max(n_docs_tokens)
        raise Exception(
            f"The longest document and prompt pair together contains {total_tokens} "
            f"tokens, above the limit {MAX_TOKENS_LIMIT} for semantic search. Please consider "
            f"shortening the prompt or the longest document."
        )

    else:
        # If we can add some context documents but not all of them, we should
        # query the search endpoint to rank docs by score.
        sorted_doc_infos = semantic_search(
            search_model,
            question_for_search,
            examples=[{"text": doc} for doc in documents],
            max_documents=max_rerank,
        )

    # Select documents w.r.t. the context length limitation.
    context, sorted_doc_infos = select_by_length(
        sorted_doc_infos,
        leftover_token_len,
        lambda_fn=lambda x: x["text"].strip() + " ",
    )

    # Add the instruction before the context and the prompt after the context.
    if context:
        context = CONTEXT_TEMPLATE.format(context=context.strip())
    full_prompt = instruction + context + prompt

    completion_result = openai.Completion.create(
        engine=model,
        prompt=full_prompt,
        logit_bias=logit_bias,
        temperature=temperature,
        n=n,
        max_tokens=max_tokens,
        stop=stop,
        logprobs=logprobs,
    )

    completion_result["selected_documents"] = sorted_doc_infos

    result = dict(
        object="answer",
        selected_documents=completion_result.pop("selected_documents"),
        completion=completion_result["id"],
    )

    result["answers"] = [
        item["text"].replace("A:", "").split("Q:")[0].strip()
        for item in completion_result["choices"]
    ]

    return result


print(
    answers(
        examples=[
            ["What is the capital of Washington", "Olympia"],
            ["What is the capital of Oregon", "Salem"],
        ],
        question="What is the capital of China?",
        examples_context="I am a bot that names country capitals",
        documents=["I am a bot that names country capitals"],
        model="davinci",
        search_model="ada",
        alternative_question="different test",
        max_tokens=16,
        stop=["\n\n"],
    )
)
@@ -1,302 +0,0 @@

import itertools
from collections import defaultdict

from transformers import GPT2TokenizerFast

import openai

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

MAX_TOKENS_LIMIT = 2048


def create_instruction(labels) -> str:
    """
    Construct an instruction for a classification task.
    """
    instruction = f"Please classify a piece of text into the following categories: {', '.join(labels)}."

    return f"{instruction.strip()}\n\n"


def semantic_search(
    search_model, query_for_search, file_id=None, max_documents=None, examples=None
):
    """
    :param examples: A list of {"text": ...} or {"text": ..., "label": ...}.
    :return:
        a list of semantic search result dicts of documents sorted by "score":
        [
            {
                "document": ...,
                "object": "search_result",
                "score": ...,
                "text": ...,
            },
            ...
        ]
    """
    assert (examples is None) ^ (file_id is None)  # xor

    if file_id is not None:
        # This is where you'd do an Elasticsearch call. Since there isn't an example of this
        # we can query, we'll raise an error.
        # The return value from this would be a list of examples.
        raise NotImplementedError()

    # This isn't quite accurate since Search is also being deprecated. See our search guide for
    # more information.

    search_result = openai.Search.create(
        model=search_model,
        documents=[x["text"] for x in examples],
        query=query_for_search,
    )

    info_dict = {d["document"]: d for d in search_result["data"]}
    sorted_doc_ids = sorted(
        info_dict.keys(), key=lambda x: info_dict[x]["score"], reverse=True
    )
    if max_documents:
        sorted_doc_ids = sorted_doc_ids[:max_documents]
    return [info_dict[i] for i in sorted_doc_ids]


def select_by_length(
    sorted_doc_infos,
    max_token_len,
    lambda_fn=None,
):
    """
    Given a list of documents sorted by score, select as many documents as possible
    as long as the total token length does not go above `max_token_len`.

    :param sorted_doc_infos: A list of semantic search result dicts of documents sorted by "score".
    :param max_token_len: The maximum token length for selected documents.
    :param lambda_fn: A function that takes in a search result dict and outputs a formatted
        example for context stuffing.
    :return: A tuple of (
        a concatenation of the selected documents used as context,
        a list of the selected document infos
    )
    """
    if not sorted_doc_infos:
        return "", []

    selected_indices = []
    total_doc_tokens = 0
    doc_dict = {}
    for i, doc_info in enumerate(sorted_doc_infos):
        doc = lambda_fn(doc_info) if lambda_fn else doc_info["text"]
        n_doc_tokens = len(tokenizer.encode(doc))
        if total_doc_tokens + n_doc_tokens < max_token_len:
            total_doc_tokens += n_doc_tokens
            selected_indices.append(i)
            doc_dict[i] = doc

    # The top ranked documents should go at the end.
    selected_indices = selected_indices[::-1]

    context = "".join([doc_dict[i] for i in selected_indices])
    selected_doc_infos = [sorted_doc_infos[i] for i in selected_indices]
    return context, selected_doc_infos


def format_example_fn(x: dict) -> str:
    return "Text: {text}\nCategory: {label}\n---\n".format(
        text=x["text"].replace("\n", " ").strip(),
        label=x["label"].replace("\n", " ").strip(),
    )


def classifications(
    query,
    model,
    search_model="ada",
    examples=None,
    file=None,
    labels=None,
    temperature=0.0,
    logprobs=None,
    max_examples=200,
    logit_bias=None,
    alternative_query=None,
    max_tokens=16,
) -> dict:
    """
    Given a query and a list of (text, label) example pairs, select the most relevant
    examples to construct a prompt for few-shot classification.

    The constructed prompt for the final completion call:
    ```
    {{ an optional instruction }}

    Text: example 1 text
    Category: example 1 label
    ---
    Text: example 2 text
    Category: example 2 label
    ---
    Text: query
    Category:
    ```

    The returned object has a structure like:
    {
        "label": "Happy",
        "model": "ada",
        "object": "classification",
        "selected_examples": [
            {
                "document": ...,  # document index, same as in search/ results.
                "text": ...,
                "label": ...,
            },
            ...
        ],
    }
    """

    query = query.replace("\n", " ").strip()
    logit_bias = logit_bias if logit_bias else {}
    labels = labels if labels else []

    if file is None and examples is None:
        raise Exception("Please submit at least one of `examples` or `file`.")
    if file is not None and examples is not None:
        raise Exception("Please submit only one of `examples` or `file`.")

    instruction = create_instruction(labels)

    query_for_search = alternative_query if alternative_query is not None else query

    # Extract examples and example labels first.
    if file is not None:
        sorted_doc_infos = semantic_search(
            search_model,
            query_for_search,
            file_id=file,
            max_documents=max_examples,
        )

    else:
        example_prompts = [
            format_example_fn(dict(text=x, label=y)) for x, y in examples
        ]
        n_examples_tokens = [len(tokenizer.encode(x)) for x in example_prompts]

    query_prompt = f"Text: {query}\nCategory:"
    n_instruction_tokens = len(tokenizer.encode(instruction))
    n_query_tokens = len(tokenizer.encode(query_prompt))

    # Aside from all the required content, how many tokens are left for context stuffing.
    leftover_token_len = MAX_TOKENS_LIMIT - (
        n_instruction_tokens + n_query_tokens + max_tokens
    )

    # Process when `examples` are provided but no `file` is provided.
    if examples:
        if (max_examples is None or max_examples >= len(examples)) and sum(
            n_examples_tokens
        ) < leftover_token_len:
            # If the total length of the examples is short enough to add all of them, no search call.
            selected_indices = list(range(len(examples)))

            sorted_doc_infos = [
                {"document": i, "text": examples[i][0], "label": examples[i][1]}
                for i in selected_indices
            ]

        elif max(n_examples_tokens) + n_query_tokens >= MAX_TOKENS_LIMIT:
            # If the prompt and the longest example together go above the limit:
            total_tokens = max(n_examples_tokens) + n_query_tokens
            raise Exception(
                f"The longest classification example, query and prompt together contain "
                f"{total_tokens} tokens, above the limit {MAX_TOKENS_LIMIT} for semantic search. "
                f"Please consider shortening your instruction, query or the longest example."
            )

        else:
            # If we can add some context documents but not all of them, we should
            # query the search endpoint to rank docs by score.
            sorted_doc_infos = semantic_search(
                search_model,
                query_for_search,
                examples=[{"text": x, "label": y} for x, y in examples],
                max_documents=max_examples,
            )

            # Per label, we have a list of doc ids sorted by relevancy to the query.
            label_to_indices = defaultdict(list)
            for idx, d in enumerate(sorted_doc_infos):
                label_to_indices[d["label"]].append(idx)

            # Do a round robin over the different labels, taking the best match for each label.
            label_indices = [label_to_indices[label] for label in labels]
            mixed_indices = [
                i for x in itertools.zip_longest(*label_indices) for i in x if i is not None
            ]
            sorted_doc_infos = [sorted_doc_infos[i] for i in mixed_indices]

    # Try to select as many examples as needed to fit into the context.
    context, sorted_doc_infos = select_by_length(
        sorted_doc_infos,
        leftover_token_len,
        lambda_fn=format_example_fn,
    )

    prompt = instruction + context + query_prompt

    completion_params = {
        "engine": model,
        "prompt": prompt,
        "temperature": temperature,
        "logprobs": logprobs,
        "logit_bias": logit_bias,
        "max_tokens": max_tokens,
        "stop": "\n",
        "n": 1,
    }

    completion_resp = openai.Completion.create(
        **completion_params,
    )

    label = completion_resp["choices"][0]["text"]
    label = label.split("\n")[0].strip().lower().capitalize()
    if label not in labels:
        label = "Unknown"

    result = dict(
        # TODO: Add id for object persistence.
        object="classification",
        model=completion_resp["model"],
        label=label,
        completion=completion_resp["id"],
    )

    result["selected_examples"] = sorted_doc_infos

    return result


print(
    classifications(
        query="this is my test",
        model="davinci",
        search_model="ada",
        examples=[
            ["this is my test", "davinci"],
            ["this is other test", "blahblah"],
        ],
        file=None,
        labels=["davinci", "blahblah"],
        temperature=0.1,
        logprobs=0,
        max_examples=200,
        logit_bias=None,
        alternative_query="different test",
        max_tokens=16,
    )
)
@@ -1,74 +0,0 @@

from transformers import GPT2TokenizerFast

import openai

tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

docs = ["test1", "asdklgjnasdv", "banana", "lord lollipop"]
query = "apple orang asdansbdausd"


def construct_context(query, document):
    return "<|endoftext|>{document}\n\n---\n\nThe above passage is related to: {query}".format(
        document=document, query=query
    )


def get_score(context, query, log_probs, text_offsets) -> float:
    SCORE_MULTIPLIER = 100.0

    log_prob = 0
    count = 0
    cutoff = len(context) - len(query)

    # Walk backwards over the echoed tokens, summing log probabilities until we
    # cross from the query portion back into the document portion of the prompt.
    for i in range(len(text_offsets) - 1, 0, -1):
        log_prob += log_probs[i]
        count += 1

        if text_offsets[i] <= cutoff and text_offsets[i] != text_offsets[i - 1]:
            break

    return log_prob / float(count) * SCORE_MULTIPLIER


def search(query, documents, engine):
    # The first prompt uses an empty document so it can serve as a baseline score.
    prompts = [construct_context(query, doc) for doc in [""] + documents]

    resps = openai.Completion.create(
        model=engine,
        prompt=prompts,
        temperature=1.0,
        top_p=1.0,
        max_tokens=0,
        logprobs=0,
        n=1,
        echo=True,
    )

    resps_by_index = {choice["index"]: choice for choice in resps["choices"]}

    scores = [
        get_score(
            prompts[i],
            query,
            resps_by_index[i]["logprobs"]["token_logprobs"],
            resps_by_index[i]["logprobs"]["text_offset"],
        )
        for i in range(len(prompts))
    ]

    # Subtract the baseline (empty document) score from each score and drop the baseline entry.
    scores = [score - scores[0] for score in scores][1:]

    return [
        {
            "object": "search_result",
            "document": document_idx,
            "score": round(score, 3),
        }
        for document_idx, score in enumerate(scores)
    ]


print(search(query=query, documents=docs, engine="davinci"))