Merge pull request #414 from openai/embedding_retrieval_deep_dive

Pushing enterprise knowledge retrieval to cookbook
colin-openai 1 year ago committed by GitHub
commit 12527df1f9

README.md
@@ -0,0 +1,36 @@
# Enterprise Knowledge Retrieval
This app is a deep dive on Enterprise Knowledge Retrieval, which aims to take unstructured text documents and create a usable knowledge base application from them.
This repo contains a notebook and a basic Streamlit app:
- `enterprise_knowledge_retrieval.ipynb`: A notebook containing a step-by-step process of tokenising, chunking and embedding your data in a vector database, building a chat agent on top, and running a basic evaluation of its performance.
- `chatbot.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.
To run the app, please follow the instructions below in the ```App``` section.
## Notebook
The notebook is the best place to start, and takes you through an end-to-end workflow for setting up and evaluating a simple back-end knowledge retrieval service:
- **Setup:** Initiate variables and connect to a vector database.
- **Storage:** Configure the database, prepare our data and store embeddings and metadata for retrieval.
- **Search:** Extract relevant documents back out with a basic search function and use an LLM to summarise results into a concise reply.
- **Answer:** Add a more sophisticated agent which will process the user's query and maintain a memory for follow-up questions.
- **Evaluate:** Take question/answer pairs using our service, evaluate and plot them to scope out any remedial action.
Once you've run the notebook through to the Search stage, you should have what you need to set up and run the app.
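
For orientation, here is a minimal sketch of the query path the notebook builds, using the helpers defined in `database.py` below (the example question is hypothetical, and the sketch assumes a populated index):

```python
import openai

from config import CHAT_MODEL, INDEX_NAME, RETRIEVAL_PROMPT
from database import get_redis_connection, get_redis_results

# Retrieve the most relevant chunks for a question from the vector index
redis_client = get_redis_connection()
results = get_redis_results(redis_client, "Who won the 2022 World Cup?", INDEX_NAME)

# Concatenate the top hits and ask the chat model to summarise them into one reply
content = "\n\n".join(results.head(3)["result"])
answer = openai.ChatCompletion.create(
    model=CHAT_MODEL,
    messages=[{
        "role": "user",
        "content": RETRIEVAL_PROMPT.format(
            SEARCH_QUERY_HERE="Who won the 2022 World Cup?",
            SEARCH_CONTENT_HERE=content,
        ),
    }],
)["choices"][0]["message"]["content"]
```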
## App
We've included a basic Streamlit app that you can interact with to test your retrieval service using either standard semantic search or [HyDE](https://arxiv.org/abs/2212.10496) retrieval, which generates a hypothetical answer first and searches with its embedding.
To use it:
- Ensure you followed the Setup and Storage steps from the notebook to populate a vector database with searchable content.
- Set up a virtual environment by running ```virtualenv venv``` (ensure ```virtualenv``` is installed).
- Activate the environment by running ```source venv/bin/activate```.
- Install requirements by running ```pip install -r requirements.txt```.
- Run ```streamlit run chatbot.py``` to fire up the Streamlit app in your browser.
## Limitations
- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your needs.
- The notebook introduces many areas you may wish to optimise; we'll dive deeper into these in subsequent cookbooks.

assistant.py
@@ -0,0 +1,183 @@
from langchain.agents import (
Tool,
AgentExecutor,
LLMSingleActionAgent,
AgentOutputParser,
)
from langchain.prompts import BaseChatPromptTemplate
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish, HumanMessage
from langchain.memory import ConversationBufferWindowMemory
import openai
import re
import streamlit as st
from database import get_redis_results, get_redis_connection
from config import RETRIEVAL_PROMPT, CHAT_MODEL, INDEX_NAME, SYSTEM_PROMPT
redis_client = get_redis_connection()


def answer_user_question(query):
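    """Run a semantic search for the query and summarise the top hits into a single answer."""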
results = get_redis_results(redis_client, query, INDEX_NAME)
results.to_csv("results.csv")
search_content = ""
    for _, row in results.head(3).iterrows():
        search_content += row["title"] + "\n" + row["result"] + "\n\n"
retrieval_prepped = RETRIEVAL_PROMPT.format(
SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
)
retrieval = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[{"role": "user", "content": retrieval_prepped}],
max_tokens=500,
)
# Response provided by GPT-3.5
    return retrieval["choices"][0]["message"]["content"]


def answer_question_hyde(query):
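    """HyDE retrieval: generate a hypothetical answer first, then search the index using that answer."""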
hyde_prompt = """You are OracleGPT, an helpful expert who answers user questions to the best of their ability.
Provide a confident answer to their question. If you don't know the answer, make the best guess you can based on the context of the question.
User question: {USER_QUESTION_HERE}
Answer:"""
hypothetical_answer = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[
{
"role": "user",
"content": hyde_prompt.format(USER_QUESTION_HERE=query),
}
],
)["choices"][0]["message"]["content"]
    # Uncomment to inspect the hypothetical answer while debugging
    # st.write(hypothetical_answer)
results = get_redis_results(redis_client, hypothetical_answer, INDEX_NAME)
results.to_csv("results.csv")
search_content = ""
    for _, row in results.head(3).iterrows():
        search_content += row["title"] + "\n" + row["result"] + "\n\n"
    retrieval_prepped = RETRIEVAL_PROMPT.format(
        SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
    )
retrieval = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[{"role": "user", "content": retrieval_prepped}],
max_tokens=500,
)
    return retrieval["choices"][0]["message"]["content"]


# Set up a prompt template
class CustomPromptTemplate(BaseChatPromptTemplate):
# The template to use
template: str
# The list of tools available
    tools: List[Tool]

    def format_messages(self, **kwargs) -> List[HumanMessage]:
# Get the intermediate steps (AgentAction, Observation tuples)
# Format them in a particular way
intermediate_steps = kwargs.pop("intermediate_steps")
thoughts = ""
for action, observation in intermediate_steps:
thoughts += action.log
thoughts += f"\nObservation: {observation}\nThought: "
# Set the agent_scratchpad variable to that value
kwargs["agent_scratchpad"] = thoughts
# Create a tools variable from the list of tools provided
kwargs["tools"] = "\n".join(
[f"{tool.name}: {tool.description}" for tool in self.tools]
)
# Create a list of tool names for the tools provided
kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
formatted = self.template.format(**kwargs)
        return [HumanMessage(content=formatted)]


class CustomOutputParser(AgentOutputParser):
def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
# Check if agent should finish
if "Final Answer:" in llm_output:
return AgentFinish(
# Return values is generally always a dictionary with a single `output` key
# It is not recommended to try anything else at the moment :)
return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
log=llm_output,
)
# Parse out the action and action input
regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
match = re.search(regex, llm_output, re.DOTALL)
if not match:
raise ValueError(f"Could not parse LLM output: `{llm_output}`")
action = match.group(1).strip()
action_input = match.group(2)
# Return the action and action input
return AgentAction(
tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output
        )


def initiate_agent(tools):
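    """Assemble a LangChain single-action agent with windowed conversation memory over the given tools."""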
prompt = CustomPromptTemplate(
template=SYSTEM_PROMPT,
tools=tools,
# This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
# The history template includes "history" as an input variable so we can interpolate it into the prompt
input_variables=["input", "intermediate_steps", "history"],
)
# Initiate the memory with k=2 to keep the last two turns
# Provide the memory to the agent
memory = ConversationBufferWindowMemory(k=2)
output_parser = CustomOutputParser()
llm = ChatOpenAI(temperature=0)
# LLM chain consisting of the LLM and a prompt
llm_chain = LLMChain(llm=llm, prompt=prompt)
tool_names = [tool.name for tool in tools]
agent = LLMSingleActionAgent(
llm_chain=llm_chain,
output_parser=output_parser,
stop=["\nObservation:"],
allowed_tools=tool_names,
)
agent_executor = AgentExecutor.from_agent_and_tools(
agent=agent, tools=tools, verbose=True, memory=memory
)
    return agent_executor


def ask_gpt(query):
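    """Answer directly from the chat model's own knowledge, bypassing the vector store."""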
response = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[
{
"role": "user",
"content": "Please answer my question.\nQuestion: {}".format(query),
}
],
temperature=0,
)
return response["choices"][0]["message"]["content"]

chatbot.py
@@ -0,0 +1,86 @@
from langchain.agents import Tool
import pandas as pd
import streamlit as st
from streamlit_chat import message
from database import get_redis_connection
from assistant import (
answer_user_question,
initiate_agent,
answer_question_hyde,
ask_gpt,
)
# Initialise database
## Initialise Redis connection
redis_client = get_redis_connection()
### CHATBOT APP
# --- GENERAL SETTINGS ---
PAGE_TITLE: str = "Knowledge Retrieval Bot"
PAGE_ICON: str = "🤖"
st.set_page_config(page_title=PAGE_TITLE, page_icon=PAGE_ICON)
st.title("Wiki Chatbot")
st.subheader("Learn things - random things!")
# Using object notation
add_selectbox = st.sidebar.selectbox(
"What kind of search?", ("Standard vector search", "HyDE")
)
# Define which tools the agent can use to answer user queries
tools = [
Tool(
name="Search",
func=answer_user_question
if add_selectbox == "Standard vector search"
else answer_question_hyde,
description="Useful for when you need to answer general knowledge questions. Input should be a fully formed question.",
),
Tool(
name="Ask",
func=ask_gpt,
description="Useful if the question is not general knowledge. Input should be a fully formed question.",
),
]
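
# Streamlit re-runs this script on every interaction, so chat history and the
# agent are kept in st.session_state to persist across reruns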
if "generated" not in st.session_state:
st.session_state["generated"] = []
if "past" not in st.session_state:
st.session_state["past"] = []
prompt = st.text_input("What do you want to know: ", "", key="input")
if st.button("Submit", key="generationSubmit"):
with st.spinner("Thinking..."):
# Initialization
if "agent" not in st.session_state:
st.session_state["agent"] = initiate_agent(tools)
response = st.session_state["agent"].run(prompt)
st.session_state.past.append(prompt)
st.session_state.generated.append(response)
if len(st.session_state["generated"]) > 0:
for i in range(len(st.session_state["generated"]) - 1, -1, -1):
message(st.session_state["generated"][i], key=str(i))
message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
with st.expander("See search results"):
results = list(pd.read_csv("results.csv")["result"])
st.write(results)

config.py
@@ -0,0 +1,46 @@
REDIS_HOST = "localhost"
REDIS_PORT = "6379"
REDIS_DB = "0"
INDEX_NAME = "wiki-index"
VECTOR_FIELD_NAME = "content_vector"
CHAT_MODEL = "gpt-3.5-turbo"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
# Set up the base template
SYSTEM_PROMPT = """You are WikiGPT, a helpful bot who has access to a database of Wikipedia data to answer questions.
Accept the first answer that you are provided and return it to the user.
You have access to the following tools:
{tools}
Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin! Remember to give detailed, informative answers.
Previous conversation history:
{history}
New question: {input}
{agent_scratchpad}"""
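
# The SYSTEM_PROMPT above follows the ReAct pattern: the model interleaves
# Thought/Action/Observation steps until it emits a Final Answer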
# Build a prompt to provide the original query, the result and ask to summarise for the user
RETRIEVAL_PROMPT = """Use the content to answer the search query the customer has sent. Provide the source for your answer.
If you can't answer the user's question, say "Sorry, I am unable to answer the question with the content". Do not guess.
Search query:
{SEARCH_QUERY_HERE}
Content:
{SEARCH_CONTENT_HERE}
Answer:
"""

database.py
@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
import openai
from redis import Redis as r
from redis.commands.search.query import Query
from config import (
REDIS_DB,
REDIS_HOST,
REDIS_PORT,
VECTOR_FIELD_NAME,
EMBEDDINGS_MODEL,
INDEX_NAME,
)
def get_redis_connection():
redis_client = r(
host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=False
)
    return redis_client


# Make query to Redis
def query_redis(redis_conn, query, index_name, top_k=5):
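    """Embed the query text and run a KNN vector search against the index, returning raw Redis results."""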
    ## Create an embedding vector from the user query
    embedded_query = np.array(
        openai.Embedding.create(input=query, model=EMBEDDINGS_MODEL)["data"][0]["embedding"],
        dtype=np.float32,
    ).tobytes()
# prepare the query
q = (
Query(f"*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]")
.sort_by("vector_score")
.paging(0, top_k)
.return_fields("vector_score", "url", "title", "content", "text_chunk_index")
.dialect(2)
)
params_dict = {"vec_param": embedded_query}
# Execute the query
results = redis_conn.ft(index_name).search(q, query_params=params_dict)
    return results


# Get mapped documents from Redis results
def get_redis_results(redis_conn, query, index_name):
# Get most relevant documents from Redis
query_result = query_redis(redis_conn, query, index_name)
# Extract info into a list
query_result_list = []
for i, result in enumerate(query_result.docs):
result_order = i
url = result.url
title = result.title
text = result.content
score = result.vector_score
query_result_list.append((result_order, url, title, text, score))
    # Return the results as a DataFrame for ease of use
result_df = pd.DataFrame(query_result_list)
result_df.columns = ["id", "url", "title", "result", "certainty"]
return result_df
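

# Example usage (hypothetical query; assumes Redis is running and the index is populated):
#   conn = get_redis_connection()
#   df = get_redis_results(conn, "Who won the 2022 World Cup?", INDEX_NAME)
#   print(df[["title", "certainty"]].head())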

Binary file not shown (image, 72 KiB).

requirements.txt
@@ -0,0 +1,7 @@
langchain==0.0.158
numpy==1.24.2
openai==0.27.4
pandas==2.0.0
redis==4.5.4
streamlit==1.22.0
streamlit_chat==0.0.2.2