diff --git a/README.md b/README.md index 436a3b29..9992b412 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Most code examples are written in Python, though the concepts can be applied in ## Recently added/updated 🆕 ✨ +- [Embeddings playground (streamlit app)](apps/embeddings-playground/README.md) [May 19th, 2023] - [How to use a multi-step prompt to write unit tests](examples/Unit_test_writing_using_a_multi-step_prompt.ipynb) [May 19, 2023] - [How to create dynamic masks with DALL·E and Segment Anything](examples/dalle/How_to_create_dynamic_masks_with_DALL-E_and_Segment_Anything.ipynb) [May 19th, 2023] - [Question answering using embeddings](examples/Question_answering_using_embeddings.ipynb) [Apr 14th, 2023] @@ -36,6 +37,7 @@ Most code examples are written in Python, though the concepts can be applied in - [Clustering embeddings](examples/Clustering.ipynb) - [Visualizing embeddings in 2D](examples/Visualizing_embeddings_in_2D.ipynb) or [3D](examples/Visualizing_embeddings_in_3D.ipynb) - [Embedding long texts](examples/Embedding_long_inputs.ipynb) + - [Embeddings playground (streamlit app)](apps/embeddings-playground/README.md) [May 19th, 2023] - Apps - [File Q&A](apps/file-q-and-a/) - [Web Crawl Q&A](apps/web-crawl-q-and-a) diff --git a/apps/embeddings-playground/README.md b/apps/embeddings-playground/README.md new file mode 100644 index 00000000..336a2cde --- /dev/null +++ b/apps/embeddings-playground/README.md @@ -0,0 +1,42 @@ +# Embeddings Playground + +[`embeddings_playground.py`](embeddings_playground.py) is a single-page streamlit app for experimenting with OpenAI embeddings. + +## Installation + +Before running, install required dependencies with: + +`pip install -r apps/embeddings-playground/requirements.txt` + +(You may need to change the path to match your local path.) + +Verify installation of streamlit with `streamlit hello`. 
+ +## Usage + +Run the script with: + +`streamlit run apps/embeddings-playground/embeddings_playground.py` + +(Again, you may need to change the path to match your local path.) + +In the app, first select your choice of: +- distance metric (we recommend cosine) +- embedding model (we recommend `text-embedding-ada-002` for most use cases, as of May 2023) + +Then, enter a variable number of strings to compare. Click `Rank` to see: +- the ranked list of strings, sorted by distance from the first string +- a heatmap showing the distance between each pair of strings + +## Example + +Here's an example distance matrix for 8 example strings related to `The sky is blue`: + +![example distance matrix](example_distance_matrix.png) + +From these distance pairs, you can see: +- embeddings measure topical similarity more than logical similarity (e.g., `The sky is blue` is very close to `The sky is not blue`) +- punctuation affects embeddings (e.g., `"THE. SKY. IS. BLUE!"` is only third closest to `The sky is blue`) +- within-language pairs are stronger than across-language pairs (e.g., `El cielo es azul` is closer to `El cielo es rojo` than to `The sky is blue`) + +Experiment with your own strings to see what you can learn. \ No newline at end of file diff --git a/apps/embeddings-playground/embeddings_playground.py b/apps/embeddings-playground/embeddings_playground.py new file mode 100644 index 00000000..86890287 --- /dev/null +++ b/apps/embeddings-playground/embeddings_playground.py @@ -0,0 +1,178 @@ +""" +EMBEDDINGS PLAYGROUND + +This is a single-page streamlit app for experimenting with OpenAI embeddings. + +Before running, install required dependencies with: + +`pip install -r apps/embeddings-playground/requirements.txt` + +You may need to change the path to match your local path. + +Verify installation of streamlit with `streamlit hello`. 
"""
EMBEDDINGS PLAYGROUND

This is a single-page streamlit app for experimenting with OpenAI embeddings.

Before running, install required dependencies with:

`pip install -r apps/embeddings-playground/requirements.txt`

You may need to change the path to match your local path.

Verify installation of streamlit with `streamlit hello`.

Run this script with:

`streamlit run apps/embeddings-playground/embeddings_playground.py`

Again, you may need to change the path to match your local path.
"""

# IMPORTS
import altair as alt
import openai
import os
import pandas as pd
from scipy import spatial
import streamlit as st
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

# CONSTANTS

# Maps the label shown in the UI to the scipy function that computes the
# distance between two embedding vectors. Defined before the functions below
# because plot_distance_matrix looks metrics up in it by label.
# NOTE: scipy's `correlation` distance was left out by the original author,
# who was unsure it is meaningful for individual vectors (it looked very
# similar to cosine).
distance_metrics = {
    "cosine": spatial.distance.cosine,
    "L1 (cityblock)": spatial.distance.cityblock,
    "L2 (euclidean)": spatial.distance.euclidean,
    "Linf (chebyshev)": spatial.distance.chebyshev,
}

# FUNCTIONS


# get embeddings
# NOTE(review): `openai.Embedding` is the interface of the pre-1.0 `openai`
# package and requirements.txt does not pin a version -- confirm the installed
# package still provides it.
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
@st.cache_data
def embedding_from_string(input: str, model: str) -> list:
    """Return the embedding of `input` as computed by the OpenAI model `model`.

    Results are cached by streamlit, so repeating a (string, model) pair does
    not trigger another API request. A failing request is retried with a
    randomized exponential back-off (1-60 s) for at most 6 attempts.
    """
    response = openai.Embedding.create(input=input, model=model)
    return response["data"][0]["embedding"]


# plot distance matrix
def plot_distance_matrix(strings: list, engine: str, distance: str, ranked: bool = False):
    """Render a chart of the pairwise embedding distances between `strings`.

    Args:
        strings: the texts to compare; at least two are required.
        engine: name of the OpenAI embedding model to use.
        distance: a key of `distance_metrics` selecting the distance function.
        ranked: when True, order the y axis by distance from the first string
            instead of by input order. Defaults to False (input order).
    """
    if len(strings) < 2:
        # with fewer than two strings there are no off-diagonal pairs to draw
        st.warning("Enter at least two strings to plot a distance matrix.")
        return

    metric = distance_metrics[distance]

    # create dataframe of embedding distances
    df = pd.DataFrame({"string": strings, "index": range(len(strings))})
    df["embedding"] = df["string"].apply(lambda string: embedding_from_string(string, engine))
    # prefix each string with its 1-based position so duplicate texts stay distinct
    df["string"] = df.apply(lambda row: f"({row['index'] + 1}) {row['string']}", axis=1)
    df = pd.merge(df, df, how="cross", suffixes=("_1", "_2"))
    df = df[df["string_1"] != df["string_2"]]  # filter out diagonal (always 0)
    df["distance"] = [metric(a, b) for a, b in zip(df["embedding_1"], df["embedding_2"])]
    df["label"] = df["distance"].map("{:.2f}".format)

    # set chart params
    text_size = 32
    label_size = 16
    pixels_per_string = 80  # aka row height & column width (perpendicular to text)
    max_label_width = 256  # in pixels (presumably -- the original author was unsure)
    longest_label = df["string_1"].str.len().max()
    chart_width = (
        50
        + min(max_label_width, longest_label * label_size / 2)
        + len(strings) * pixels_per_string
    )

    # extract chart parameters from data
    color_min = df["distance"].min()
    color_max = 1.5 * df["distance"].max()
    x_order = df["string_1"].unique().tolist()
    if ranked:
        first_label = f"(1) {strings[0]}"
        ranked_df = df[df["string_1"] == first_label].sort_values(by="distance")
        # the first string is its own nearest neighbour, so it leads the ranking
        y_order = [first_label, *ranked_df["string_2"].tolist()]
    else:
        y_order = x_order

    # the chart only needs these columns; keeping the embedding vectors would
    # serialize thousands of floats per row into the chart spec
    chart_df = df[["string_1", "string_2", "distance", "label"]]

    # create chart
    boxes = (
        alt.Chart(chart_df, title=f"{engine}")
        .mark_rect()
        .encode(
            x=alt.X("string_1", title=None, sort=x_order),
            y=alt.Y("string_2", title=None, sort=y_order),
            color=alt.Color(
                "distance:Q",
                title=f"{distance} distance",
                scale=alt.Scale(domain=[color_min, color_max], scheme="darkblue", reverse=True),
            ),
        )
    )

    labels = (
        boxes.mark_text(align="center", baseline="middle", fontSize=text_size)
        .encode(text="label")
        .configure_axis(labelLimit=max_label_width, labelFontSize=label_size)
        .properties(width=chart_width, height=chart_width)
    )

    st.altair_chart(labels)  # note: layered plots are not supported in streamlit :(


# PAGE

st.title("OpenAI Embeddings Playground")

# get API key: prefer the environment, otherwise ask for it in the page
api_key = os.getenv("OPENAI_API_KEY")
if api_key:
    openai.api_key = api_key
    # show only the ends of the key so it is never displayed in full
    st.write(f"API key successfully retrieved: {api_key[:3]}...{api_key[-4:]}")
else:
    st.header("Enter API Key")
    openai.api_key = st.text_input("API key", type="password")

# select distance metric
st.header("Select distance metric")
distance_metric_options = list(distance_metrics)
distance = st.radio("Distance metric", distance_metric_options)

# select models
st.header("Select models")
models = [
    "text-embedding-ada-002",
    "text-similarity-ada-001",
    "text-similarity-babbage-001",
    "text-similarity-curie-001",
    "text-similarity-davinci-001",
]
prechecked_models = [
    "text-embedding-ada-002",
]
model_values = [
    st.checkbox(model, key=model, value=(model in prechecked_models)) for model in models
]

# enter strings
st.header("Enter strings")
if "num_boxes" not in st.session_state:
    st.session_state.num_boxes = 5
if st.session_state.num_boxes > 2:  # never drop below two text boxes
    if st.button("Remove last text box"):
        st.session_state.num_boxes -= 1
if st.button("Add new text box"):
    st.session_state.num_boxes += 1
strings = [st.text_input(f"String {i+1}") for i in range(st.session_state.num_boxes)]

# rank strings
st.header("Rank strings by relatedness")
if st.button("Rank"):
    selected_models = [model for model, checked in zip(models, model_values) if checked]
    # blank boxes are skipped rather than sent to the API
    entered_strings = [string for string in strings if string.strip()]

    if not openai.api_key:
        st.warning("Enter an OpenAI API key before ranking.")
    elif not selected_models:
        st.warning("Select at least one model.")
    elif len(entered_strings) < 2:
        st.warning("Enter at least two non-empty strings to compare.")
    else:
        metric = distance_metrics[distance]

        # display a dataframe comparing rankings to the first non-empty string
        st.subheader("Rankings")
        ranked_strings = {}
        for model in selected_models:
            query_embedding = embedding_from_string(entered_strings[0], model)
            distances = [
                metric(query_embedding, embedding_from_string(string, model))
                for string in entered_strings
            ]
            order = sorted(range(len(entered_strings)), key=distances.__getitem__)
            ranked_strings[model] = [entered_strings[i] for i in order]
        st.dataframe(pd.DataFrame(ranked_strings))

        # display charts of all the pairwise distances between strings
        st.subheader("Distance matrices")
        for model in selected_models:
            plot_distance_matrix(entered_strings, model, distance)
b/apps/embeddings-playground/requirements.txt new file mode 100644 index 00000000..c2326224 --- /dev/null +++ b/apps/embeddings-playground/requirements.txt @@ -0,0 +1,6 @@ +altair +openai +pandas +scipy +streamlit +tenacity \ No newline at end of file