Remove legacy transition guides from 2021/2022 (#718)

2024-11-04 06:00:33 +00:00 · 2023-09-15 18:47:08 -05:00 · 2023-09-15 18:47:08 -05:00 · 78c6ed57ca
commit 78c6ed57ca
parent fd4e31bb00
4 changed files with 0 additions and 730 deletions
--- a/transition_guides_for_deprecated_API_endpoints/README.md
+++ b/transition_guides_for_deprecated_API_endpoints/README.md
@ -1,50 +0,0 @@
-# Deprecation of Answers, Classifications, and Search
-
-In 2021, OpenAI released specialized endpoints in beta for Answers, Classifications, and Search.
-
-While these specialized endpoints were convenient, they had two drawbacks:
-
-1. These specialized endpoints were eclipsed by techniques that achieved better results.
-2. These specialized endpoints were more difficult to customize and optimize for individual use cases.
-
-As a result, **the Answers, Classifications, and Search endpoints are being deprecated.**
-
-## Timeline of deprecation
-
-For those who have not used these endpoints, nothing will change except that access will no longer be available.
-
-**For existing users of these endpoints, access will continue until December 3, 2022.** Before that date, we strongly encourage developers to switch over to newer techniques which produce better results.
-
-## How to transition
-
-We've written guides and code examples for transitioning from the deprecated API endpoints to better methods.
-
-### Answers
-
-[Guide: How to transition off the Answers endpoint](https://help.openai.com/en/articles/6233728-answers-transition-guide)
-
-* Option 1: transition to embeddings-based search **(recommended)**
-  * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
-
-* Option 2: reimplement Answers endpoint functionality
-  * Example code: [answers_functionality_example.py](answers_functionality_example.py)
-
-### Classifications
-
-[Guide: How to transition off the Classifications endpoint](https://help.openai.com/en/articles/6272941-classifications-transition-guide)
-
-* Option 1: transition to fine-tuning **(recommended)**
-  * Example code: [Fine-tuned_classification.ipynb](../examples/Fine-tuned_classification.ipynb)
-* Option 2: transition to embeddings
-  * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
-* Option 3: reimplement Classifications endpoint functionality
-  * Example code: [classification_functionality_example.py](classification_functionality_example.py)
-
-### Search
-
-[Guide: How to transition off the Search endpoint](https://help.openai.com/en/articles/6272952-search-transition-guide)
-
-* Option 1: transition to embeddings-based search **(recommended)**
-  * Example code: [Semantic_text_search_using_embeddings.ipynb](../examples/Semantic_text_search_using_embeddings.ipynb)
-* Option 2: reimplement Search endpoint functionality
-  * Example code: [search_functionality_example.py](search_functionality_example.py)
--- a/transition_guides_for_deprecated_API_endpoints/answers_functionality_example.py
+++ b/transition_guides_for_deprecated_API_endpoints/answers_functionality_example.py
@ -1,304 +0,0 @@
-from transformers import GPT2TokenizerFast
-
-import openai
-
-# GPT-2 tokenizer; this file uses it only to count tokens via `tokenizer.encode`.
-tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-
-# Token budget from which the instruction, prompt and `max_tokens` are subtracted
-# to decide how much room is left for context documents.
-MAX_TOKENS_LIMIT = 2048
-# Instruction line placed at the start of the answers prompt.
-ANSWERS_INSTRUCTION = "Please answer the question according to the above context.\n"
-# Wrapper applied to context text before it is added to the prompt.
-CONTEXT_TEMPLATE = "===\nContext: {context}\n===\n"
-
-
-def extract_instruction(instruction):
-    """
-    Normalize the optional `instruction` text for use at the top of a prompt.
-
-    Returns an empty string when `instruction` is None; otherwise the text with
-    surrounding whitespace removed, followed by a blank line.
-    """
-    return "" if instruction is None else instruction.strip() + "\n\n"
-
-
-def semantic_search(
-    search_model, query_for_search, file_id=None, max_documents=None, examples=None
-):
-    """
-    Rank `examples` against a query with the Search endpoint.
-
-    Exactly one of `examples` and `file_id` must be given (checked with an assert).
-
-    :param search_model: Model name passed to `openai.Search.create`.
-    :param query_for_search: Query the documents are scored against.
-    :param file_id: File-backed search; not implemented in this example.
-    :param max_documents: If truthy, keep only this many top-scoring results.
-    :param examples: A list of {"text":...} or {"text": ..., "label": ...}.
-    :return:
-        a list of semantic search result dict of documents sorted by "score"
-        (highest first). Each result also carries the fields of the example it
-        refers to, so "text" (and "label", when supplied) are always present:
-        [
-            {
-                "document": ...,
-                "object": "search_result",
-                "score": ...,
-                "text": ...,
-            },
-            ...
-        ]
-    :raises NotImplementedError: if `file_id` is given.
-    """
-    assert (examples is None) ^ (file_id is None)  # xor
-
-    if file_id is not None:
-        # This is where you'd do an elastic search call.  Since there isn't an example of this
-        # we can query, we'll raise an error.
-        # The return value from this would be a list of examples
-        raise NotImplementedError()
-
-    # This isn't quite accurate since Search is also being deprecated. See our search guide for more
-    # information.
-
-    search_result = openai.Search.create(
-        model=search_model,
-        documents=[x["text"] for x in examples],
-        query=query_for_search,
-    )
-
-    # Callers read "text" / "label" from the results, but a search result is
-    # expected to identify its document only by index (TODO confirm against the
-    # Search API reference). Copy the example's own fields in; any field the
-    # response does provide takes precedence.
-    info_dict = {
-        d["document"]: {**examples[d["document"]], **d} for d in search_result["data"]
-    }
-    ranked_infos = sorted(
-        info_dict.values(), key=lambda info: info["score"], reverse=True
-    )
-    if max_documents:
-        ranked_infos = ranked_infos[:max_documents]
-    return ranked_infos
-
-
-def select_by_length(
-    sorted_doc_infos,
-    max_token_len,
-    lambda_fn=None,
-):
-    """
-    Pick ranked documents whose combined token count stays strictly below
-    `max_token_len`.
-
-    Documents are visited in ranking order; one that would reach or exceed the
-    budget is skipped and the scan continues with the next one.
-
-    :param sorted_doc_infos: A list of semantic search result dict of documents sorted by "score".
-    :param max_token_len: The token budget for the selected documents.
-    :param lambda_fn: Optional function that turns a result dict into the text used
-        for context stuffing; when falsy, the dict's "text" value is used.
-    :return: A tuple of (
-        the selected texts concatenated, top-ranked document last,
-        the selected result dicts in that same order
-    )
-    """
-    if not sorted_doc_infos:
-        return "", []
-
-    kept = []  # (text, result dict) pairs in ranking order
-    used_tokens = 0
-    for info in sorted_doc_infos:
-        if lambda_fn:
-            text = lambda_fn(info)
-        else:
-            text = info["text"]
-        cost = len(tokenizer.encode(text))
-        if used_tokens + cost >= max_token_len:
-            continue
-        used_tokens += cost
-        kept.append((text, info))
-
-    # The top ranked documents should go at the end.
-    kept.reverse()
-
-    context = "".join(text for text, _ in kept)
-    return context, [info for _, info in kept]
-
-
-def answers(
-    examples,
-    question,
-    model,
-    examples_context,
-    file_id=None,
-    documents=None,
-    logit_bias=None,
-    max_rerank=200,
-    max_tokens=16,
-    alternative_question=None,
-    search_model="ada",
-    temperature=0.0,
-    logprobs=0,
-    stop=None,
-    n=1,
-):
-    """
-    Given a prompt, a question, a list of (question, answer) pairs as examples, and
-    a list of documents for context, it tries to include all the QA examples and top
-    relevant context documents.
-
-    The constructed prompt for the final completion call:
-    ```
-    Please answer the question according to the above context.
-
-    ===
-    Context: {{ the context for example QA pairs. }}
-    ===
-    Q: example 1 question
-    A: example 1 answer
-    ---
-    Q: example 2 question
-    A: example 2 answer
-    ===
-    Context: {{ a list of relevant documents sorted via search(question, documents) }}
-    ===
-    Q: question
-    A:
-    ```
-
-    :param examples: Iterable of (question, answer) pairs; falsy means none.
-    :param question: The question to answer.
-    :param model: Engine used for the completion call.
-    :param examples_context: Context text for the QA examples; falsy means none.
-    :param file_id: File to search; exactly one of `file_id` / `documents` is required.
-    :param documents: List of context document strings.
-    :param logit_bias: Passed to the completion call; None becomes {}.
-    :param max_rerank: Maximum number of documents kept after ranking.
-    :param max_tokens: Completion length; also reserved from the token budget.
-    :param alternative_question: If not None, used instead of `question` for search.
-    :param search_model: Model used for ranking documents.
-    :param temperature: Passed to the completion call.
-    :param logprobs: Passed to the completion call.
-    :param stop: Passed to the completion call.
-    :param n: Passed to the completion call.
-    :raises Exception: if neither or both of `documents` / `file_id` are given, or
-        if the question plus the longest document exceed `MAX_TOKENS_LIMIT`.
-
-    The returned object has a structure like:
-    {
-      "answers": [
-        "Beijing",
-        "Beijing, China"
-      ],
-      "completion": "xxx-xxx",
-      "object": "answer",
-      "selected_documents": [
-        {
-            "document": ...,    # document index, same as in search/ results.
-            "object": "search_result",
-            "text": ...,
-        },
-        ...
-      ],
-    }
-    """
-
-    examples = examples if examples else []
-
-    example_prompts = [f"Q: {x}\nA: {y}" for x, y in examples]
-    prompt = f"Q: {question}\nA:"
-
-    # Append all the QA examples into the prompt. A missing examples context
-    # contributes nothing (concatenating None used to raise TypeError).
-    examples_context = (
-        CONTEXT_TEMPLATE.format(context=examples_context) if examples_context else ""
-    )
-    instruction = (
-        ANSWERS_INSTRUCTION + examples_context + "\n---\n".join(example_prompts) + "\n"
-    )
-
-    logit_bias = logit_bias if logit_bias is not None else {}
-
-    if file_id is None and documents is None:
-        raise Exception("Please submit at least one of `documents` or `file`.")
-    if file_id is not None and documents is not None:
-        raise Exception("Please submit only one of `documents` or `file`.")
-
-    instruction = extract_instruction(instruction)
-
-    n_instruction_tokens = len(tokenizer.encode(instruction))
-    n_prompt_tokens = len(tokenizer.encode(prompt))
-    n_query_tokens = len(tokenizer.encode(question))
-    n_context_tokens = len(tokenizer.encode(CONTEXT_TEMPLATE.format(context="")))
-
-    if documents is not None:
-        documents = [doc.strip() + " " for doc in documents]
-        n_docs_tokens = [len(tokenizer.encode(doc)) for doc in documents]
-
-    # Except all the required content, how many tokens left for context stuffing.
-    leftover_token_len = MAX_TOKENS_LIMIT - (
-        n_instruction_tokens + n_context_tokens + n_prompt_tokens + max_tokens
-    )
-    sorted_doc_infos = []
-
-    question_for_search = (
-        alternative_question if alternative_question is not None else question
-    )
-    if file_id is not None:
-        # semantic_search returns a single list of results, so it must not be
-        # unpacked into two names.
-        sorted_doc_infos = semantic_search(
-            search_model,
-            question_for_search,
-            file_id=file_id,
-            max_documents=max_rerank,
-        )
-
-    elif len(documents) == 0:
-        # If no context document is provided, do nothing.
-        pass
-
-    elif min(n_docs_tokens) >= leftover_token_len:
-        # If there is no room for adding any context doc.
-        pass
-
-    elif (max_rerank is None or max_rerank >= len(documents)) and sum(
-        n_docs_tokens
-    ) < leftover_token_len:
-        # If the total length of docs is short enough to be added all.
-        sorted_doc_infos = [
-            {"document": i, "text": doc} for i, doc in enumerate(documents)
-        ]
-
-    elif n_query_tokens + max(n_docs_tokens) >= MAX_TOKENS_LIMIT:
-        # If the prompt and the longest document together go above the limit.
-        total_tokens = n_query_tokens + max(n_docs_tokens)
-        raise Exception(
-            f"The longest document and prompt pair together contains {total_tokens} "
-            f"tokens, above the limit {MAX_TOKENS_LIMIT} for semantic search. Please consider "
-            f"shortening the prompt or the longest document."
-        )
-
-    else:
-        # If we can add some context documents but not all of them, we should
-        # query search endpoint to rank docs by score.
-        sorted_doc_infos = semantic_search(
-            search_model,
-            question_for_search,
-            examples=[{"text": doc} for doc in documents],
-            max_documents=max_rerank,
-        )
-
-    # Select documents w.r.t. the context length limitation.
-    context, sorted_doc_infos = select_by_length(
-        sorted_doc_infos,
-        leftover_token_len,
-        lambda_fn=lambda x: x["text"].strip() + " ",
-    )
-
-    # Add instruction before the context and the prompt after the context.
-    if context:
-        context = CONTEXT_TEMPLATE.format(context=context.strip())
-    full_prompt = instruction + context + prompt
-
-    completion_result = openai.Completion.create(
-        engine=model,
-        prompt=full_prompt,
-        logit_bias=logit_bias,
-        temperature=temperature,
-        n=n,
-        max_tokens=max_tokens,
-        stop=stop,
-        logprobs=logprobs,
-    )
-
-    result = dict(
-        object="answer",
-        selected_documents=sorted_doc_infos,
-        completion=completion_result["id"],
-    )
-
-    # Keep only the answer text: drop any echoed "A:" marker and anything from a
-    # following "Q:" onwards.
-    result["answers"] = [
-        item["text"].replace("A:", "").split("Q:")[0].strip()
-        for item in completion_result["choices"]
-    ]
-
-    return result
-
-
-# Example invocation; running this file performs the search/completion calls.
-example_result = answers(
-    examples=[
-        ["What is the capital of Washington", "Olympia"],
-        ["What is the capital of Oregon", "Salem"],
-    ],
-    question="What is the capital of China?",
-    examples_context="I am a bot that names country capitals",
-    documents=["I am a bot that names country capitals"],
-    model="davinci",
-    search_model="ada",
-    alternative_question="different test",
-    max_tokens=16,
-    stop=["\n\n"],
-)
-print(example_result)
--- a/transition_guides_for_deprecated_API_endpoints/classification_functionality_example.py
+++ b/transition_guides_for_deprecated_API_endpoints/classification_functionality_example.py
@ -1,302 +0,0 @@
-import itertools
-from collections import defaultdict
-
-from transformers import GPT2TokenizerFast
-
-import openai
-
-# GPT-2 tokenizer; this file uses it only to count tokens via `tokenizer.encode`.
-tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-
-# Token budget from which the instruction, query prompt and `max_tokens` are
-# subtracted to decide how much room is left for examples.
-MAX_TOKENS_LIMIT = 2048
-
-
-def create_instruction(labels) -> str:
-    """
-    Build the instruction paragraph that lists the candidate categories.
-
-    The labels are joined with ", " and the sentence is followed by a blank line.
-    """
-    categories = ", ".join(labels)
-    sentence = (
-        "Please classify a piece of text into the following categories: "
-        + categories
-        + "."
-    )
-    return sentence.strip() + "\n\n"
-
-
-def semantic_search(
-    search_model, query_for_search, file_id=None, max_documents=None, examples=None
-):
-    """
-    Rank `examples` against a query with the Search endpoint.
-
-    Exactly one of `examples` and `file_id` must be given (checked with an assert).
-
-    :param search_model: Model name passed to `openai.Search.create`.
-    :param query_for_search: Query the documents are scored against.
-    :param file_id: File-backed search; not implemented in this example.
-    :param max_documents: If truthy, keep only this many top-scoring results.
-    :param examples: A list of {"text":...} or {"text": ..., "label": ...}.
-    :return:
-        a list of semantic search result dict of documents sorted by "score"
-        (highest first). Each result also carries the fields of the example it
-        refers to, so "text" (and "label", when supplied) are always present:
-        [
-            {
-                "document": ...,
-                "object": "search_result",
-                "score": ...,
-                "text": ...,
-            },
-            ...
-        ]
-    :raises NotImplementedError: if `file_id` is given.
-    """
-    assert (examples is None) ^ (file_id is None)  # xor
-
-    if file_id is not None:
-        # This is where you'd do an elastic search call.  Since there isn't an example of this
-        # we can query, we'll raise an error.
-        # The return value from this would be a list of examples
-        raise NotImplementedError()
-
-    # This isn't quite accurate since Search is also being deprecated. See our search guide for more
-    # information.
-
-    search_result = openai.Search.create(
-        model=search_model,
-        documents=[x["text"] for x in examples],
-        query=query_for_search,
-    )
-
-    # Callers read "text" / "label" from the results, but a search result is
-    # expected to identify its document only by index (TODO confirm against the
-    # Search API reference). Copy the example's own fields in; any field the
-    # response does provide takes precedence.
-    info_dict = {
-        d["document"]: {**examples[d["document"]], **d} for d in search_result["data"]
-    }
-    ranked_infos = sorted(
-        info_dict.values(), key=lambda info: info["score"], reverse=True
-    )
-    if max_documents:
-        ranked_infos = ranked_infos[:max_documents]
-    return ranked_infos
-
-
-def select_by_length(
-    sorted_doc_infos,
-    max_token_len,
-    lambda_fn=None,
-):
-    """
-    Pick ranked documents whose combined token count stays strictly below
-    `max_token_len`.
-
-    Documents are visited in ranking order; one that would reach or exceed the
-    budget is skipped and the scan continues with the next one.
-
-    :param sorted_doc_infos: A list of semantic search result dict of documents sorted by "score".
-    :param max_token_len: The token budget for the selected documents.
-    :param lambda_fn: Optional function that turns a result dict into the text used
-        for context stuffing; when falsy, the dict's "text" value is used.
-    :return: A tuple of (
-        the selected texts concatenated, top-ranked document last,
-        the selected result dicts in that same order
-    )
-    """
-    if not sorted_doc_infos:
-        return "", []
-
-    kept = []  # (text, result dict) pairs in ranking order
-    used_tokens = 0
-    for info in sorted_doc_infos:
-        if lambda_fn:
-            text = lambda_fn(info)
-        else:
-            text = info["text"]
-        cost = len(tokenizer.encode(text))
-        if used_tokens + cost >= max_token_len:
-            continue
-        used_tokens += cost
-        kept.append((text, info))
-
-    # The top ranked documents should go at the end.
-    kept.reverse()
-
-    context = "".join(text for text, _ in kept)
-    return context, [info for _, info in kept]
-
-
-def format_example_fn(x: dict) -> str:
-    """
-    Render one labelled example as a "Text: ... / Category: ..." prompt entry.
-
-    Newlines inside the text and the label are replaced by spaces and both values
-    are stripped; the entry ends with a "---" separator line.
-    """
-
-    def _single_line(value):
-        return value.replace("\n", " ").strip()
-
-    text = _single_line(x["text"])
-    label = _single_line(x["label"])
-    return "Text: " + text + "\nCategory: " + label + "\n---\n"
-
-
-def classifications(
-    query,
-    model,
-    search_model="ada",
-    examples=None,
-    file=None,
-    labels=None,
-    temperature=0.0,
-    logprobs=None,
-    max_examples=200,
-    logit_bias=None,
-    alternative_query=None,
-    max_tokens=16,
-) -> dict:
-    """
-    Given a prompt, a question and a list of examples, containing (text, label) pairs,
-    it selects top relevant examples to construct a prompt for few-shot classification.
-
-    The constructed prompt for the final completion call:
-    ```
-    {{ an optional instruction }}
-
-    Text: example 1 text
-    Category: example 1 label
-    ---
-    Text: example 2 text
-    Category: example 2 label
-    ---
-    Text: question
-    Category:
-    ```
-
-    :param query: Text to classify; newlines are replaced by spaces.
-    :param model: Engine used for the completion call.
-    :param search_model: Model used for ranking examples.
-    :param examples: List of (text, label) pairs; exactly one of `examples` / `file`
-        is required.
-    :param file: File to search instead of `examples`.
-    :param labels: Candidate labels; falsy means an empty list.
-    :param temperature: Passed to the completion call.
-    :param logprobs: Passed to the completion call.
-    :param max_examples: Maximum number of examples kept after ranking.
-    :param logit_bias: Passed to the completion call; falsy becomes {}.
-    :param alternative_query: If not None, used instead of `query` for search.
-    :param max_tokens: Completion length; also reserved from the token budget.
-    :raises Exception: if neither or both of `examples` / `file` are given, or if
-        the longest example plus the query exceed `MAX_TOKENS_LIMIT`.
-
-    The returned object has a structure like:
-    {
-      "label": "Happy",
-      "model": "ada",
-      "object": "classification",
-      "completion": "xxx-xxx",
-      "selected_examples": [
-        {
-            "document": ...,    # document index, same as in search/ results.
-            "text": ...,
-            "label": ...,
-        },
-        ...
-      ],
-    }
-    """
-
-    query = query.replace("\n", " ").strip()
-    logit_bias = logit_bias if logit_bias else {}
-    labels = labels if labels else []
-
-    if file is None and examples is None:
-        raise Exception("Please submit at least one of `examples` or `file`.")
-    if file is not None and examples is not None:
-        raise Exception("Please submit only one of `examples` or `file`.")
-
-    instruction = create_instruction(labels)
-
-    query_for_search = alternative_query if alternative_query is not None else query
-
-    # Start with no selected examples so an empty `examples` list falls through to
-    # a zero-shot prompt (the name used to be unbound in that case).
-    sorted_doc_infos = []
-    n_examples_tokens = []
-
-    # Extract examples and example labels first.
-    if file is not None:
-        sorted_doc_infos = semantic_search(
-            search_model,
-            query_for_search,
-            file_id=file,
-            max_documents=max_examples,
-        )
-
-    else:
-        example_prompts = [
-            format_example_fn(dict(text=x, label=y)) for x, y in examples
-        ]
-        n_examples_tokens = [len(tokenizer.encode(x)) for x in example_prompts]
-
-    query_prompt = f"Text: {query}\nCategory:"
-    n_instruction_tokens = len(tokenizer.encode(instruction))
-    n_query_tokens = len(tokenizer.encode(query_prompt))
-
-    # Except all the required content, how many tokens left for context stuffing.
-    leftover_token_len = MAX_TOKENS_LIMIT - (
-        n_instruction_tokens + n_query_tokens + max_tokens
-    )
-
-    # Process when `examples` are provided but no `file` is provided.
-    if examples:
-        if (max_examples is None or max_examples >= len(examples)) and sum(
-            n_examples_tokens
-        ) < leftover_token_len:
-            # If the total length of docs is short enough that we can add all examples, no search call.
-            sorted_doc_infos = [
-                {"document": i, "text": text, "label": label}
-                for i, (text, label) in enumerate(examples)
-            ]
-
-        elif max(n_examples_tokens) + n_query_tokens >= MAX_TOKENS_LIMIT:
-            # If the prompt and the longest example together go above the limit:
-            total_tokens = max(n_examples_tokens) + n_query_tokens
-            # Exception accepts no keyword arguments, so the message is positional.
-            raise Exception(
-                f"The longest classification example, query and prompt together contain "
-                f"{total_tokens} tokens, above the limit {MAX_TOKENS_LIMIT} for semantic search. "
-                f"Please consider shortening your instruction, query or the longest example."
-            )
-
-        else:
-            # If we can add some context documents but not all of them, we should
-            # query search endpoint to rank docs by score.
-            sorted_doc_infos = semantic_search(
-                search_model,
-                query_for_search,
-                examples=[{"text": x, "label": y} for x, y in examples],
-                max_documents=max_examples,
-            )
-
-    # Per label, we have a list of doc id sorted by its relevancy to the query.
-    label_to_indices = defaultdict(list)
-    for idx, d in enumerate(sorted_doc_infos):
-        label_to_indices[d["label"]].append(idx)
-
-    # Do a round robin for each of the different labels, taking the best match for each label.
-    label_indices = [label_to_indices[label] for label in labels]
-    mixed_indices = [
-        i for x in itertools.zip_longest(*label_indices) for i in x if i is not None
-    ]
-    sorted_doc_infos = [sorted_doc_infos[i] for i in mixed_indices]
-
-    # Try to select as many examples as needed to fit into the context
-    context, sorted_doc_infos = select_by_length(
-        sorted_doc_infos,
-        leftover_token_len,
-        lambda_fn=format_example_fn,
-    )
-
-    prompt = instruction + context + query_prompt
-
-    completion_params = {
-        "engine": model,
-        "prompt": prompt,
-        "temperature": temperature,
-        "logprobs": logprobs,
-        "logit_bias": logit_bias,
-        "max_tokens": max_tokens,
-        "stop": "\n",
-        "n": 1,
-    }
-
-    completion_resp = openai.Completion.create(
-        **completion_params,
-    )
-
-    label = completion_resp["choices"][0]["text"]
-    label = label.split("\n")[0].strip().lower().capitalize()
-    if label not in labels:
-        # The prediction was capitalized above, so also compare it against the
-        # caller's labels in that normalized form and hand back the caller's own
-        # spelling; otherwise labels such as "davinci" could never match.
-        labels_by_normalized = {
-            known.strip().lower().capitalize(): known for known in labels
-        }
-        label = labels_by_normalized.get(label, "Unknown")
-
-    result = dict(
-        # TODO: Add id for object persistence.
-        object="classification",
-        model=completion_resp["model"],
-        label=label,
-        completion=completion_resp["id"],
-    )
-
-    result["selected_examples"] = sorted_doc_infos
-
-    return result
-
-
-# Example invocation; running this file performs the completion call.
-example_result = classifications(
-    query="this is my test",
-    model="davinci",
-    search_model="ada",
-    examples=[
-        ["this is my test", "davinci"],
-        ["this is other test", "blahblah"],
-    ],
-    file=None,
-    labels=["davinci", "blahblah"],
-    temperature=0.1,
-    logprobs=0,
-    max_examples=200,
-    logit_bias=None,
-    alternative_query="different test",
-    max_tokens=16,
-)
-print(example_result)
--- a/transition_guides_for_deprecated_API_endpoints/search_functionality_example.py
+++ b/transition_guides_for_deprecated_API_endpoints/search_functionality_example.py
@ -1,74 +0,0 @@
-from transformers import GPT2TokenizerFast
-
-import openai
-
-# NOTE(review): `tokenizer` is not referenced by any function visible in this file.
-tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-
-# Sample documents and query passed to `search` by the example call at the end of the file.
-docs = ["test1", "asdklgjnasdv", "banana", "lord lollipop"]
-query = "apple orang asdansbdausd"
-
-
-def construct_context(query, document):
-    """
-    Build the scoring prompt for one document: the document text, a separator,
-    and a closing line that ends with the query.
-    """
-    return f"<|endoftext|>{document}\n\n---\n\nThe above passage is related to: {query}"
-
-
-def get_score(context, query, log_probs, text_offsets) -> float:
-    """
-    Average the log-probabilities of the trailing tokens of `context` and scale
-    the result by 100.
-
-    Tokens are consumed from the last one backwards (index 0 is never read). The
-    walk stops after the first token whose offset is at or before
-    `len(context) - len(query)` and differs from the preceding token's offset.
-    """
-    SCORE_MULTIPLIER = 100.0
-
-    boundary = len(context) - len(query)
-    total = 0
-    n_tokens = 0
-
-    idx = len(text_offsets) - 1
-    while idx > 0:
-        total += log_probs[idx]
-        n_tokens += 1
-
-        at_or_before_boundary = text_offsets[idx] <= boundary
-        if at_or_before_boundary and text_offsets[idx] != text_offsets[idx - 1]:
-            break
-        idx -= 1
-
-    return total / float(n_tokens) * SCORE_MULTIPLIER
-
-
-def search(query, documents, engine):
-    """
-    Score each document against `query` using completion log-probabilities.
-
-    An empty document is scored alongside the real ones and its score is
-    subtracted from every other score as a baseline.
-
-    :param query: The search query.
-    :param documents: List of document strings.
-    :param engine: Model name passed to `openai.Completion.create`.
-    :return: One {"object", "document", "score"} dict per document, in input
-        order, with "document" being the document's index.
-    """
-    # Prompt 0 is the empty-document baseline.
-    prompts = [construct_context(query, doc) for doc in [""] + documents]
-
-    resps = openai.Completion.create(
-        model=engine,
-        prompt=prompts,
-        temperature=1.0,
-        top_p=1.0,
-        max_tokens=0,
-        logprobs=0,
-        n=1,
-        echo=True,
-    )
-
-    # Choices are matched back to prompts through their "index" field.
-    choice_by_index = {}
-    for choice in resps["choices"]:
-        choice_by_index[choice["index"]] = choice
-
-    raw_scores = []
-    for position, prompt in enumerate(prompts):
-        logprob_info = choice_by_index[position]["logprobs"]
-        raw_scores.append(
-            get_score(
-                prompt,
-                query,
-                logprob_info["token_logprobs"],
-                logprob_info["text_offset"],
-            )
-        )
-
-    # Process results
-    baseline = raw_scores[0]
-    relative_scores = [raw - baseline for raw in raw_scores[1:]]
-
-    results = []
-    for document_idx, score in enumerate(relative_scores):
-        results.append(
-            {
-                "object": "search_result",
-                "document": document_idx,
-                "score": round(score, 3),
-            }
-        )
-    return results
-
-
-# Example invocation; running this file performs the completion call.
-search_results = search(query=query, documents=docs, engine="davinci")
-print(search_results)