From 78c6ed57cad6255a21e7e221cae9aa493de8f8d1 Mon Sep 17 00:00:00 2001
From: Will DePue
Date: Fri, 15 Sep 2023 18:47:08 -0500
Subject: [PATCH] Remove legacy transition guides from 2021/2022 (#718)

---
 .../README.md                               |  50 ---
 .../answers_functionality_example.py        | 304 ------------------
 .../classification_functionality_example.py | 302 -----------------
 .../search_functionality_example.py         |  74 -----
 4 files changed, 730 deletions(-)
 delete mode 100644 transition_guides_for_deprecated_API_endpoints/README.md
 delete mode 100644 transition_guides_for_deprecated_API_endpoints/answers_functionality_example.py
 delete mode 100644 transition_guides_for_deprecated_API_endpoints/classification_functionality_example.py
 delete mode 100644 transition_guides_for_deprecated_API_endpoints/search_functionality_example.py

diff --git a/transition_guides_for_deprecated_API_endpoints/README.md b/transition_guides_for_deprecated_API_endpoints/README.md
deleted file mode 100644
index fda6b96d..00000000
--- a/transition_guides_for_deprecated_API_endpoints/README.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# Deprecation of Answers, Classification, and Search
-
-In 2021, OpenAI released specialized endpoints in beta for Answers, Classification, and Search.
-
-While these specialized endpoints were convenient, they had two drawbacks:
-
-1. These specialized endpoints were eclipsed by techniques that achieved better results.
-2. These specialized endpoints were more difficult to customize and optimize for individual use cases.
-
-As a result, **the Answers, Classifications, and Search endpoints are being deprecated.**
-
-## Timeline of deprecation
-
-For those who have not used these endpoints, nothing will change except that access will no longer be available.
-
-**For existing users of these endpoints, access will continue until December 3, 2022.** Before that date, we strongly encourage developers to switch over to newer techniques which produce better results.
-
-## How to transition
-
-We've written guides and code examples for transitioning from the deprecated API endpoints to better methods.
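-
-A recurring recommendation below is embeddings-based search. As a rough, minimal sketch of the idea (assuming the `openai` Python package, `numpy`, and the `text-embedding-ada-002` model; the helper names here are illustrative, not taken from the linked guides):
-
-```python
-import numpy as np
-
-import openai
-
-
-def embed(texts, model="text-embedding-ada-002"):
-    # One API call can embed a whole batch of strings.
-    resp = openai.Embedding.create(input=texts, model=model)
-    return [d["embedding"] for d in resp["data"]]
-
-
-def rank_by_similarity(query, documents):
-    query_emb = np.array(embed([query])[0])
-    doc_embs = np.array(embed(documents))
-    # OpenAI embeddings are unit-normalized, so a dot product is cosine similarity.
-    scores = doc_embs @ query_emb
-    return sorted(zip(documents, scores), key=lambda pair: pair[1], reverse=True)
-```
-
-The linked notebooks below walk through the same idea end to end.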
-
-### Answers
-
-[Guide: How to transition off the Answers endpoint](https://help.openai.com/en/articles/6233728-answers-transition-guide)
-
-* Option 1: transition to embeddings-based search **(recommended)**
-  * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
-
-* Option 2: reimplement Answers endpoint functionality
-  * Example code: [answers_functionality_example.py](answers_functionality_example.py)
-
-### Classification
-
-[Guide: How to transition off the Classifications endpoint](https://help.openai.com/en/articles/6272941-classifications-transition-guide)
-
-* Option 1: transition to fine-tuning **(recommended)**
-  * Example code: [Fine-tuned_classification.ipynb](../examples/Fine-tuned_classification.ipynb)
-* Option 2: transition to embeddings
-  * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
-* Option 3: reimplement Classifications endpoint functionality
-  * Example code: [classification_functionality_example.py](classification_functionality_example.py)
-
-### Search
-
-[Guide: How to transition off the Search endpoint](https://help.openai.com/en/articles/6272952-search-transition-guide)
-
-* Option 1: transition to embeddings-based search **(recommended)**
-  * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
-* Option 2: reimplement Search endpoint functionality
-  * Example code: [search_functionality_example.py](search_functionality_example.py)
diff --git a/transition_guides_for_deprecated_API_endpoints/answers_functionality_example.py b/transition_guides_for_deprecated_API_endpoints/answers_functionality_example.py
deleted file mode 100644
index 7c69a442..00000000
--- a/transition_guides_for_deprecated_API_endpoints/answers_functionality_example.py
+++ /dev/null
@@ -1,304 +0,0 @@
-from transformers import GPT2TokenizerFast
-
-import openai
-
-tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-
-MAX_TOKENS_LIMIT = 2048
-ANSWERS_INSTRUCTION = "Please answer the question according to the above context.\n"
-CONTEXT_TEMPLATE = "===\nContext: {context}\n===\n"
-
-
-def extract_instruction(instruction):
-    """
-    Extract the `instruction` parameter and format it properly.
-    If it does not exist, return an empty string.
-    """
-    if instruction is None:
-        return ""
-
-    return f"{instruction.strip()}\n\n"
-
-
-def semantic_search(
-    search_model, query_for_search, file_id=None, max_documents=None, examples=None
-):
-    """
-    :param examples: A list of {"text": ...} or {"text": ..., "label": ...} dicts.
-    :return:
-        a list of semantic search result dicts for the documents, sorted by "score":
-        [
-            {
-                "document": ...,
-                "object": "search_result",
-                "score": ...,
-                "text": ...,
-            },
-            ...
-        ]
-    """
-    assert (examples is None) ^ (file_id is None)  # xor
-
-    if file_id is not None:
-        # This is where you'd make an external search call (e.g. Elasticsearch)
-        # to retrieve candidate examples for the file. Since there is no such
-        # index to query in this example, we raise an error; the return value
-        # would be a list of examples.
-        raise NotImplementedError()
-
-    # This isn't quite accurate, since the Search endpoint is also being
-    # deprecated. See our search transition guide for more information.
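-    # A hedged sketch of the embeddings-based replacement: embed the query and
-    # every document with openai.Embedding.create (e.g. the assumed
-    # text-embedding-ada-002 model), score each document by the dot product of
-    # the unit-normalized vectors, and build the same {"document", "score",
-    # "text"} dicts that the Search call below returns. The README shows the
-    # ranking code.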
-
-    search_result = openai.Search.create(
-        model=search_model,
-        documents=[x["text"] for x in examples],
-        query=query_for_search,
-    )
-
-    info_dict = {d["document"]: d for d in search_result["data"]}
-    sorted_doc_ids = sorted(
-        info_dict.keys(), key=lambda x: info_dict[x]["score"], reverse=True
-    )
-    if max_documents:
-        sorted_doc_ids = sorted_doc_ids[:max_documents]
-    return [info_dict[i] for i in sorted_doc_ids]
-
-
-def select_by_length(
-    sorted_doc_infos,
-    max_token_len,
-    lambda_fn=None,
-):
-    """
-    Given a list of (document ID, document content) pairs, select as many
-    documents as possible as long as the total length does not go above
-    `max_token_len`.
-
-    :param sorted_doc_infos: A list of semantic search result dicts, sorted by "score".
-    :param max_token_len: The maximum token length for selected documents.
-    :param lambda_fn: A function that takes a search result dict and outputs a
-        formatted example for context stuffing.
-    :return: A tuple of (
-        a concatenation of the selected documents, used as context,
-        a list of the selected search result dicts
-    )
-    """
-    if not sorted_doc_infos:
-        return "", []
-
-    selected_indices = []
-    total_doc_tokens = 0
-    doc_dict = {}
-    for i, doc_info in enumerate(sorted_doc_infos):
-        doc = lambda_fn(doc_info) if lambda_fn else doc_info["text"]
-        n_doc_tokens = len(tokenizer.encode(doc))
-        if total_doc_tokens + n_doc_tokens < max_token_len:
-            total_doc_tokens += n_doc_tokens
-            selected_indices.append(i)
-            doc_dict[i] = doc
-
-    # The top-ranked documents should go at the end of the context.
-    selected_indices = selected_indices[::-1]
-
-    context = "".join([doc_dict[i] for i in selected_indices])
-    selected_doc_infos = [sorted_doc_infos[i] for i in selected_indices]
-    return context, selected_doc_infos
-
-
-def answers(
-    examples,
-    question,
-    model,
-    examples_context,
-    file_id=None,
-    documents=None,
-    logit_bias=None,
-    max_rerank=200,
-    max_tokens=16,
-    alternative_question=None,
-    search_model="ada",
-    temperature=0.0,
-    logprobs=0,
-    stop=None,
-    n=1,
-):
-    """
-    Given a question, a list of (question, answer) pairs as examples, and a
-    list of documents for context, construct a prompt that includes all the QA
-    examples and the most relevant context documents.
-
-    The constructed prompt for the final completion call:
-    ```
-    Please answer the question according to the above context.
-
-    ===
-    Context: {{ the context for the example QA pairs }}
-    ===
-    Q: example 1 question
-    A: example 1 answer
-    ---
-    Q: example 2 question
-    A: example 2 answer
-    ===
-    Context: {{ a list of relevant documents, sorted via search(question, documents) }}
-    ===
-    Q: question
-    A:
-    ```
-
-    The returned object has a structure like:
-    {
-        "answers": [
-            "Beijing",
-            "Beijing, China"
-        ],
-        "completion": "xxx-xxx",  # the completion id
-        "object": "answer",
-        "selected_documents": [
-            {
-                "document": ...,  # document index, same as in the search results.
-                "object": "search_result",
-                "text": ...,
-            },
-            ...
-        ],
-    }
-    """
-
-    examples = examples if examples else []
-
-    example_prompts = [f"Q: {x}\nA: {y}" for x, y in examples]
-    prompt = f"Q: {question}\nA:"
-
-    # Append all the QA examples to the instruction, ahead of the prompt.
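-    # With the example pairs from the demo call at the bottom of this file, the
-    # joined example block would read:
-    #   Q: What is the capital of Washington
-    #   A: Olympia
-    #   ---
-    #   Q: What is the capital of Oregon
-    #   A: Salem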
-    if examples_context:
-        examples_context = CONTEXT_TEMPLATE.format(context=examples_context)
-    instruction = (
-        ANSWERS_INSTRUCTION + examples_context + "\n---\n".join(example_prompts) + "\n"
-    )
-
-    logit_bias = logit_bias if logit_bias is not None else {}
-
-    if file_id is None and documents is None:
-        raise Exception("Please submit at least one of `documents` or `file`.")
-    if file_id is not None and documents is not None:
-        raise Exception("Please submit only one of `documents` or `file`.")
-
-    instruction = extract_instruction(instruction)
-
-    n_instruction_tokens = len(tokenizer.encode(instruction))
-    n_prompt_tokens = len(tokenizer.encode(prompt))
-    n_query_tokens = len(tokenizer.encode(question))
-    n_context_tokens = len(tokenizer.encode(CONTEXT_TEMPLATE.format(context="")))
-
-    if documents is not None:
-        documents = [doc.strip() + " " for doc in documents]
-        n_docs_tokens = [len(tokenizer.encode(doc)) for doc in documents]
-
-    # Aside from the required content, how many tokens are left for context stuffing.
-    leftover_token_len = MAX_TOKENS_LIMIT - (
-        n_instruction_tokens + n_context_tokens + n_prompt_tokens + max_tokens
-    )
-    sorted_doc_infos = []
-
-    question_for_search = (
-        alternative_question if alternative_question is not None else question
-    )
-    if file_id is not None:
-        sorted_doc_infos = semantic_search(
-            search_model,
-            question_for_search,
-            file_id=file_id,
-            max_documents=max_rerank,
-        )
-
-    elif len(documents) == 0:
-        # If no context document is provided, do nothing.
-        pass
-
-    elif min(n_docs_tokens) >= leftover_token_len:
-        # If there is no room for adding any context doc, do nothing.
-        pass
-
-    elif (max_rerank is None or max_rerank >= len(documents)) and sum(
-        n_docs_tokens
-    ) < leftover_token_len:
-        # If the total length of the docs is short enough, add them all; no search call.
-        selected_indices = list(range(len(documents)))
-
-        sorted_doc_infos = [
-            {"document": i, "text": documents[i]} for i in selected_indices
-        ]
-
-    elif n_query_tokens + max(n_docs_tokens) >= MAX_TOKENS_LIMIT:
-        # If the prompt and the longest document together go above the limit, fail.
-        total_tokens = n_query_tokens + max(n_docs_tokens)
-        raise Exception(
-            f"The longest document and prompt pair together contains {total_tokens} "
-            f"tokens, above the limit {MAX_TOKENS_LIMIT} for semantic search. Please consider "
-            f"shortening the prompt or the longest document."
-        )
-
-    else:
-        # If we can add some context documents but not all of them, query the
-        # search endpoint to rank the docs by score.
-        sorted_doc_infos = semantic_search(
-            search_model,
-            question_for_search,
-            examples=[{"text": doc} for doc in documents],
-            max_documents=max_rerank,
-        )
-
-    # Select documents subject to the context length limit.
-    context, sorted_doc_infos = select_by_length(
-        sorted_doc_infos,
-        leftover_token_len,
-        lambda_fn=lambda x: x["text"].strip() + " ",
-    )
-
-    # Add the instruction before the context, and the prompt after the context.
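-    # The assembled prompt then reads, top to bottom: the instruction with its
-    # example QA pairs, an "===\nContext: ...\n===" block holding the selected
-    # documents, and finally "Q: {question}\nA:" for the model to complete.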
-    if context:
-        context = CONTEXT_TEMPLATE.format(context=context.strip())
-    full_prompt = instruction + context + prompt
-
-    completion_result = openai.Completion.create(
-        engine=model,
-        prompt=full_prompt,
-        logit_bias=logit_bias,
-        temperature=temperature,
-        n=n,
-        max_tokens=max_tokens,
-        stop=stop,
-        logprobs=logprobs,
-    )
-
-    completion_result["selected_documents"] = sorted_doc_infos
-
-    result = dict(
-        object="answer",
-        selected_documents=completion_result.pop("selected_documents"),
-        completion=completion_result["id"],
-    )
-
-    result["answers"] = [
-        item["text"].replace("A:", "").split("Q:")[0].strip()
-        for item in completion_result["choices"]
-    ]
-
-    return result
-
-
-print(
-    answers(
-        examples=[
-            ["What is the capital of Washington", "Olympia"],
-            ["What is the capital of Oregon", "Salem"],
-        ],
-        question="What is the capital of China?",
-        examples_context="I am a bot that names country capitals",
-        documents=["I am a bot that names country capitals"],
-        model="davinci",
-        search_model="ada",
-        alternative_question="different test",
-        max_tokens=16,
-        stop=["\n\n"],
-    )
-)
diff --git a/transition_guides_for_deprecated_API_endpoints/classification_functionality_example.py b/transition_guides_for_deprecated_API_endpoints/classification_functionality_example.py
deleted file mode 100644
index 8a7fc3c8..00000000
--- a/transition_guides_for_deprecated_API_endpoints/classification_functionality_example.py
+++ /dev/null
@@ -1,302 +0,0 @@
-import itertools
-from collections import defaultdict
-
-from transformers import GPT2TokenizerFast
-
-import openai
-
-tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-
-MAX_TOKENS_LIMIT = 2048
-
-
-def create_instruction(labels) -> str:
-    """
-    Construct an instruction for a classification task.
-    """
-    instruction = f"Please classify a piece of text into the following categories: {', '.join(labels)}."
-
-    return f"{instruction.strip()}\n\n"
-
-
-def semantic_search(
-    search_model, query_for_search, file_id=None, max_documents=None, examples=None
-):
-    """
-    :param examples: A list of {"text": ...} or {"text": ..., "label": ...} dicts.
-    :return:
-        a list of semantic search result dicts for the documents, sorted by "score":
-        [
-            {
-                "document": ...,
-                "object": "search_result",
-                "score": ...,
-                "text": ...,
-            },
-            ...
-        ]
-    """
-    assert (examples is None) ^ (file_id is None)  # xor
-
-    if file_id is not None:
-        # This is where you'd make an external search call (e.g. Elasticsearch)
-        # to retrieve candidate examples for the file. Since there is no such
-        # index to query in this example, we raise an error; the return value
-        # would be a list of examples.
-        raise NotImplementedError()
-
-    # This isn't quite accurate, since the Search endpoint is also being
-    # deprecated. See our search transition guide for more information.
-
-    search_result = openai.Search.create(
-        model=search_model,
-        documents=[x["text"] for x in examples],
-        query=query_for_search,
-    )
-
-    info_dict = {d["document"]: d for d in search_result["data"]}
-    sorted_doc_ids = sorted(
-        info_dict.keys(), key=lambda x: info_dict[x]["score"], reverse=True
-    )
-    if max_documents:
-        sorted_doc_ids = sorted_doc_ids[:max_documents]
-    return [info_dict[i] for i in sorted_doc_ids]
-
-
-def select_by_length(
-    sorted_doc_infos,
-    max_token_len,
-    lambda_fn=None,
-):
-    """
-    Given a list of (document ID, document content) pairs, select as many
-    documents as possible as long as the total length does not go above
-    `max_token_len`.
-
-    :param sorted_doc_infos: A list of semantic search result dicts, sorted by "score".
-    :param max_token_len: The maximum token length for selected documents.
-    :param lambda_fn: A function that takes a search result dict and outputs a
-        formatted example for context stuffing.
-    :return: A tuple of (
-        a concatenation of the selected documents, used as context,
-        a list of the selected search result dicts
-    )
-    """
-    if not sorted_doc_infos:
-        return "", []
-
-    selected_indices = []
-    total_doc_tokens = 0
-    doc_dict = {}
-    for i, doc_info in enumerate(sorted_doc_infos):
-        doc = lambda_fn(doc_info) if lambda_fn else doc_info["text"]
-        n_doc_tokens = len(tokenizer.encode(doc))
-        if total_doc_tokens + n_doc_tokens < max_token_len:
-            total_doc_tokens += n_doc_tokens
-            selected_indices.append(i)
-            doc_dict[i] = doc
-
-    # The top-ranked documents should go at the end of the context.
-    selected_indices = selected_indices[::-1]
-
-    context = "".join([doc_dict[i] for i in selected_indices])
-    selected_doc_infos = [sorted_doc_infos[i] for i in selected_indices]
-    return context, selected_doc_infos
-
-
-def format_example_fn(x: dict) -> str:
-    return "Text: {text}\nCategory: {label}\n---\n".format(
-        text=x["text"].replace("\n", " ").strip(),
-        label=x["label"].replace("\n", " ").strip(),
-    )
-
-
-def classifications(
-    query,
-    model,
-    search_model="ada",
-    examples=None,
-    file=None,
-    labels=None,
-    temperature=0.0,
-    logprobs=None,
-    max_examples=200,
-    logit_bias=None,
-    alternative_query=None,
-    max_tokens=16,
-) -> dict:
-    """
-    Given a query and a list of (text, label) example pairs, select the most
-    relevant examples to construct a prompt for few-shot classification.
-
-    The constructed prompt for the final completion call:
-    ```
-    {{ an optional instruction }}
-
-    Text: example 1 text
-    Category: example 1 label
-    ---
-    Text: example 2 text
-    Category: example 2 label
-    ---
-    Text: query
-    Category:
-    ```
-
-    The returned object has a structure like:
-    {
-        "label": "Happy",
-        "model": "ada",
-        "object": "classification",
-        "selected_examples": [
-            {
-                "document": ...,  # document index, same as in the search results.
-                "text": ...,
-                "label": ...,
-            },
-            ...
-        ],
-    }
-    """
-
-    query = query.replace("\n", " ").strip()
-    logit_bias = logit_bias if logit_bias else {}
-    labels = labels if labels else []
-
-    if file is None and examples is None:
-        raise Exception("Please submit at least one of `examples` or `file`.")
-    if file is not None and examples is not None:
-        raise Exception("Please submit only one of `examples` or `file`.")
-
-    instruction = create_instruction(labels)
-
-    query_for_search = alternative_query if alternative_query is not None else query
-
-    # Extract examples and example labels first.
-    if file is not None:
-        sorted_doc_infos = semantic_search(
-            search_model,
-            query_for_search,
-            file_id=file,
-            max_documents=max_examples,
-        )
-
-    else:
-        example_prompts = [
-            format_example_fn(dict(text=x, label=y)) for x, y in examples
-        ]
-        n_examples_tokens = [len(tokenizer.encode(x)) for x in example_prompts]
-
-    query_prompt = f"Text: {query}\nCategory:"
-    n_instruction_tokens = len(tokenizer.encode(instruction))
-    n_query_tokens = len(tokenizer.encode(query_prompt))
-
-    # Aside from the required content, how many tokens are left for context stuffing.
-    leftover_token_len = MAX_TOKENS_LIMIT - (
-        n_instruction_tokens + n_query_tokens + max_tokens
-    )
-
-    # Process when `examples` are provided but no `file` is provided.
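-    # Three cases follow: every example fits in the leftover token budget (use
-    # them all, no search call); the query plus the longest example would
-    # overflow the limit (raise); or only some examples fit (rank them with a
-    # search call, then round-robin across labels so each label stays
-    # represented).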
-    if examples:
-        if (max_examples is None or max_examples >= len(examples)) and sum(
-            n_examples_tokens
-        ) < leftover_token_len:
-            # If the total length of the examples is short enough, add them all; no search call.
-            selected_indices = list(range(len(examples)))
-
-            sorted_doc_infos = [
-                {"document": i, "text": examples[i][0], "label": examples[i][1]}
-                for i in selected_indices
-            ]
-
-        elif max(n_examples_tokens) + n_query_tokens >= MAX_TOKENS_LIMIT:
-            # If the query and the longest example together go above the limit, fail.
-            total_tokens = max(n_examples_tokens) + n_query_tokens
-            raise Exception(
-                f"The longest classification example, query and prompt together contain "
-                f"{total_tokens} tokens, above the limit {MAX_TOKENS_LIMIT} for semantic search. "
-                f"Please consider shortening your instruction, query or the longest example."
-            )
-
-        else:
-            # If we can add some examples but not all of them, query the search
-            # endpoint to rank them by score.
-            sorted_doc_infos = semantic_search(
                search_model,
-                query_for_search,
-                examples=[{"text": x, "label": y} for x, y in examples],
-                max_documents=max_examples,
-            )
-
-            # Per label, build a list of doc ids sorted by relevancy to the query.
-            label_to_indices = defaultdict(list)
-            for idx, d in enumerate(sorted_doc_infos):
-                label_to_indices[d["label"]].append(idx)
-
-            # Round-robin over the labels, taking the best match for each label in turn.
-            label_indices = [label_to_indices[label] for label in labels]
-            mixed_indices = [
-                i for x in itertools.zip_longest(*label_indices) for i in x if i is not None
-            ]
-            sorted_doc_infos = [sorted_doc_infos[i] for i in mixed_indices]
-
-    # Select as many examples as fit into the context budget.
-    context, sorted_doc_infos = select_by_length(
-        sorted_doc_infos,
-        leftover_token_len,
-        lambda_fn=format_example_fn,
-    )
-
-    prompt = instruction + context + query_prompt
-
-    completion_params = {
-        "engine": model,
-        "prompt": prompt,
-        "temperature": temperature,
-        "logprobs": logprobs,
-        "logit_bias": logit_bias,
-        "max_tokens": max_tokens,
-        "stop": "\n",
-        "n": 1,
-    }
-
-    completion_resp = openai.Completion.create(
-        **completion_params,
-    )
-
-    # The predicted label is normalized to capitalized form, so the `labels`
-    # passed in are expected to be capitalized as well (e.g. "Happy").
-    label = completion_resp["choices"][0]["text"]
-    label = label.split("\n")[0].strip().lower().capitalize()
-    if label not in labels:
-        label = "Unknown"
-
-    result = dict(
-        # TODO: Add an id for object persistence.
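-        # For now, the completion id below doubles as a usable identifier.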
- object="classification", - model=completion_resp["model"], - label=label, - completion=completion_resp["id"], - ) - - result["selected_examples"] = sorted_doc_infos - - return result - - -print( - classifications( - query="this is my test", - model="davinci", - search_model="ada", - examples=[ - ["this is my test", "davinci"], - ["this is other test", "blahblah"], - ], - file=None, - labels=["davinci", "blahblah"], - temperature=0.1, - logprobs=0, - max_examples=200, - logit_bias=None, - alternative_query="different test", - max_tokens=16, - ) -) diff --git a/transition_guides_for_deprecated_API_endpoints/search_functionality_example.py b/transition_guides_for_deprecated_API_endpoints/search_functionality_example.py deleted file mode 100644 index 6c9c9e97..00000000 --- a/transition_guides_for_deprecated_API_endpoints/search_functionality_example.py +++ /dev/null @@ -1,74 +0,0 @@ -from transformers import GPT2TokenizerFast - -import openai - -tokenizer = GPT2TokenizerFast.from_pretrained("gpt2") - -docs = ["test1", "asdklgjnasdv", "banana", "lord lollipop"] -query = "apple orang asdansbdausd" - - -def construct_context(query, document): - return "<|endoftext|>{document}\n\n---\n\nThe above passage is related to: {query}".format( - document=document, query=query - ) - - -def get_score(context, query, log_probs, text_offsets) -> float: - SCORE_MULTIPLIER = 100.0 - - log_prob = 0 - count = 0 - cutoff = len(context) - len(query) - - for i in range(len(text_offsets) - 1, 0, -1): - log_prob += log_probs[i] - count += 1 - - if text_offsets[i] <= cutoff and text_offsets[i] != text_offsets[i - 1]: - break - - return log_prob / float(count) * SCORE_MULTIPLIER - - -def search(query, documents, engine): - - prompts = [construct_context(query, doc) for doc in [""] + documents] - - resps = openai.Completion.create( - model=engine, - prompt=prompts, - temperature=1.0, - top_p=1.0, - max_tokens=0, - logprobs=0, - n=1, - echo=True, - ) - - resps_by_index = {choice["index"]: choice for choice in resps["choices"]} - - scores = [ - get_score( - prompts[i], - query, - resps_by_index[i]["logprobs"]["token_logprobs"], - resps_by_index[i]["logprobs"]["text_offset"], - ) - for i in range(len(prompts)) - ] - - # Process results - scores = [score - scores[0] for score in scores][1:] - - return [ - { - "object": "search_result", - "document": document_idx, - "score": round(score, 3), - } - for document_idx, score in enumerate(scores) - ] - - -print(search(query=query, documents=docs, engine="davinci"))