Pushing enterprise knowledge retrieval to cookbook

2024-11-08 01:10:29 +00:00 · 2023-05-11 10:38:30 +01:00 · 2023-05-11 10:38:30 +01:00 · ab66238d5e
commit ab66238d5e
parent d3d8d9742a
9 changed files with 62559 additions and 0 deletions
--- a/apps/enterprise-knowledge-retrieval/README.md
+++ b/apps/enterprise-knowledge-retrieval/README.md
@ -0,0 +1,36 @@
 # Enterprise Knowledge Retrieval
 This repo is a deep dive on Enterprise Knowledge Retrieval, which aims to take some unstructured text documents and create a usable knowledge base application with it.
 This repo contains a notebook and a basic Streamlit app:
 - `enterprise_knowledge_retrieval.ipynb`: A notebook containing a step by step process of tokenising, chunking and embedding your data in a vector database, building a chat agent on top and running a basic evaluation of its performance
 - `chatbot.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.
 To run the app, please follow the instructions in the `App` section below.
 ## Notebook
 The notebook is the best place to start, and takes you through an end-to-end workflow for setting up and evaluating a simple back-end knowledge retrieval service:
 - **Setup:** Initiate variables and connect to a vector database.
 - **Storage:** Configure the database, prepare our data and store embeddings and metadata for retrieval.
 - **Search:** Extract relevant documents back out with a basic search function and use an LLM to summarise results into a concise reply.
 - **Answer:** Add a more sophisticated agent which will process the user's query and maintain a memory for follow-up questions.
 - **Evaluate:** Take question/answer pairs using our service, evaluate and plot them to scope out remedial action
 Once you've run the notebook through to the Search stage, you should have what you need to set up and run the app.
 ## App
 We've rolled in a basic Streamlit app that you can interact with to test your retrieval service using either standard semantic search or HyDE (Hypothetical Document Embeddings) retrieval.
 You can use it by:
 - Ensuring you followed the Setup and Storage steps from the notebook to populate a vector database with searchable content.
 - Setting up a virtual environment with pip by running ```virtualenv venv``` (ensure ```virtualenv``` is installed).
 - Activate the environment by running ```source venv/bin/activate```.
 - Install requirements by running ```pip install -r requirements.txt```.
 - Run ```streamlit run chatbot.py``` to fire up the Streamlit app in your browser
 ## Limitations
 - This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases`, depending on your needs.
 - We introduce many areas you may optimize in the notebook, but we'll deep dive on these in separate offerings in the coming weeks.
--- a/apps/enterprise-knowledge-retrieval/assistant.py
+++ b/apps/enterprise-knowledge-retrieval/assistant.py
@ -0,0 +1,169 @@
 from langchain.agents import (
    Tool,
    AgentExecutor,
    LLMSingleActionAgent,
    AgentOutputParser,
 )
 from langchain.prompts import BaseChatPromptTemplate
 from langchain import SerpAPIWrapper, LLMChain
 from langchain.chat_models import ChatOpenAI
 from typing import List, Union
 from langchain.schema import AgentAction, AgentFinish, HumanMessage
 from langchain.memory import ConversationBufferWindowMemory
 import openai
 import re
 import streamlit as st
 from database import get_redis_results, get_redis_connection
 from config import RETRIEVAL_PROMPT, CHAT_MODEL, INDEX_NAME, SYSTEM_PROMPT
 # Module-level Redis client, created once when this module is imported and
 # shared by both answer functions below.
 redis_client = get_redis_connection()
 def answer_user_question(query):
    """Answer ``query`` from the top vector-search hits stored in Redis.

    The query is searched against the ``INDEX_NAME`` index, the full result
    table is saved to ``results.csv`` in the working directory, and the chat
    model is asked to answer from the first three hits via
    ``RETRIEVAL_PROMPT``.

    Returns the message content of the model's first choice.
    """
    hits = get_redis_results(redis_client, query, INDEX_NAME)
    hits.to_csv("results.csv")

    # Each of the first three hits contributes its title, a newline, its
    # text, and a blank line.
    search_content = "".join(
        row["title"] + "\n" + row["result"] + "\n\n"
        for _, row in hits.head(3).iterrows()
    )

    prompt_text = RETRIEVAL_PROMPT.format(
        SEARCH_QUERY_HERE=query,
        SEARCH_CONTENT_HERE=search_content,
    )
    completion = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": prompt_text}],
        max_tokens=500,
    )

    # Only the first choice returned by the chat model is used.
    first_choice = completion["choices"][0]
    return first_choice["message"]["content"]
 def answer_question_hyde(query):
    """Answer ``query`` using a HyDE-style two-step retrieval.

    First the chat model writes a hypothetical answer to ``query``; that
    answer (not the query itself) is used as the search text against the
    ``INDEX_NAME`` index. The full result table is saved to ``results.csv``
    and the first three hits are passed, together with the original query,
    to the chat model via ``RETRIEVAL_PROMPT``.

    Returns the message content of the model's first choice.
    """
    hyde_prompt = """You are OracleGPT, an helpful expert who answers user questions to the best of their ability.
    Provide a confident answer to their question. If you don't know the answer, make the best guess you can based on the context of the question.
    User question: {USER_QUESTION_HERE}
    Answer:"""
    hypothetical_answer = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[
            {
                "role": "user",
                "content": hyde_prompt.format(USER_QUESTION_HERE=query),
            }
        ],
    )["choices"][0]["message"]["content"]

    # Search with the hypothetical answer rather than the raw question.
    results = get_redis_results(redis_client, hypothetical_answer, INDEX_NAME)
    results.to_csv("results.csv")

    search_content = "".join(
        row["title"] + "\n" + row["result"] + "\n\n"
        for _, row in results.head(3).iterrows()
    )

    # RETRIEVAL_PROMPT uses {SEARCH_QUERY_HERE}/{SEARCH_CONTENT_HERE}
    # placeholders, so it must be filled with str.format (as
    # answer_user_question does). Replacing only the bare names left the
    # surrounding braces in the prompt sent to the model.
    retrieval_prepped = RETRIEVAL_PROMPT.format(
        SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
    )
    retrieval = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": retrieval_prepped}],
        max_tokens=500,
    )
    return retrieval["choices"][0]["message"]["content"]
 # Set up a prompt template
 class CustomPromptTemplate(BaseChatPromptTemplate):
    """Chat prompt that renders the agent template as a single human message."""

    # The template to use
    template: str
    # The list of tools available
    tools: List[Tool]

    def format_messages(self, **kwargs) -> List[HumanMessage]:
        """Fill ``template`` and wrap the result in one ``HumanMessage``.

        ``intermediate_steps`` (an iterable of ``(action, observation)``
        pairs) is removed from ``kwargs`` and folded into an
        ``agent_scratchpad`` string. ``tools`` and ``tool_names`` are derived
        from ``self.tools``. The remaining keyword arguments are passed
        through to ``str.format`` unchanged.

        Returns a one-element list holding the formatted message.
        """
        intermediate_steps = kwargs.pop("intermediate_steps")

        # Replay every earlier step as "<action log> / Observation / Thought"
        # so the model can continue from where it stopped.
        kwargs["agent_scratchpad"] = "".join(
            f"{action.log}\nObservation: {observation}\nThought: "
            for action, observation in intermediate_steps
        )
        # One "name: description" line per tool.
        kwargs["tools"] = "\n".join(
            f"{tool.name}: {tool.description}" for tool in self.tools
        )
        # Comma-separated tool names for the "Action:" instruction.
        kwargs["tool_names"] = ", ".join(tool.name for tool in self.tools)

        formatted = self.template.format(**kwargs)
        return [HumanMessage(content=formatted)]
 class CustomOutputParser(AgentOutputParser):
    """Turn raw LLM text into either a final answer or the next tool call."""

    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        """Parse one LLM completion.

        If the text contains ``Final Answer:``, everything after its last
        occurrence (stripped) is returned as the ``output`` of an
        ``AgentFinish``. Otherwise an ``Action`` / ``Action Input`` pair is
        extracted and returned as an ``AgentAction``.

        Raises:
            ValueError: if neither form can be found in ``llm_output``.
        """
        final_marker = "Final Answer:"
        if final_marker in llm_output:
            answer = llm_output.split(final_marker)[-1].strip()
            # The return values are a dictionary with a single `output` key.
            return AgentFinish(return_values={"output": answer}, log=llm_output)

        # Group 1 is the tool name, group 2 the tool input; DOTALL lets the
        # input span several lines.
        pattern = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        found = re.search(pattern, llm_output, re.DOTALL)
        if found is None:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")

        tool_name, raw_input = found.group(1), found.group(2)
        return AgentAction(
            tool=tool_name.strip(),
            # Drop surrounding spaces first, then any surrounding quotes.
            tool_input=raw_input.strip(" ").strip('"'),
            log=llm_output,
        )
 def initiate_agent(tools):
    """Assemble an ``AgentExecutor`` that can call ``tools``.

    The agent is prompted with ``SYSTEM_PROMPT``, parses model output with
    ``CustomOutputParser``, stops generating at ``"\\nObservation:"`` and is
    given a windowed conversation memory.

    Returns the configured executor.
    """
    # `agent_scratchpad`, `tools` and `tool_names` are not listed as input
    # variables because the template generates them itself; "history" is
    # listed so it can be interpolated into the prompt.
    agent_prompt = CustomPromptTemplate(
        template=SYSTEM_PROMPT,
        tools=tools,
        input_variables=["input", "intermediate_steps", "history"],
    )

    # k=2 keeps the last two turns of the conversation.
    window_memory = ConversationBufferWindowMemory(k=2)
    parser = CustomOutputParser()

    # LLM chain consisting of the chat model and the prompt.
    chain = LLMChain(llm=ChatOpenAI(temperature=0), prompt=agent_prompt)

    single_action_agent = LLMSingleActionAgent(
        llm_chain=chain,
        output_parser=parser,
        stop=["\nObservation:"],
        allowed_tools=[tool.name for tool in tools],
    )
    return AgentExecutor.from_agent_and_tools(
        agent=single_action_agent,
        tools=tools,
        verbose=True,
        memory=window_memory,
    )
--- a/apps/enterprise-knowledge-retrieval/chatbot.py
+++ b/apps/enterprise-knowledge-retrieval/chatbot.py
@ -0,0 +1,76 @@
 from langchain.agents import Tool
 import pandas as pd
 import streamlit as st
 from streamlit_chat import message
 from database import get_redis_connection
 from assistant import answer_user_question, initiate_agent, answer_question_hyde
 # --- DATABASE ---
 # Open a Redis connection at start-up. The client is not referenced again
 # in the visible part of this script.
 redis_client = get_redis_connection()
 # --- CHATBOT APP ---
 # --- GENERAL SETTINGS ---
 # Browser tab title and icon, followed by the on-page heading and subheading.
 PAGE_TITLE: str = "Knowledge Retrieval Bot"
 PAGE_ICON: str = "🤖"
 st.set_page_config(page_title=PAGE_TITLE, page_icon=PAGE_ICON)
 st.title("Wiki Chatbot")
 st.subheader("Learn things - random things!")
 # Sidebar selector for the retrieval strategy; its value picks the function
 # behind the Search tool below.
 add_selectbox = st.sidebar.selectbox(
    "What kind of search?", ("Standard vector search", "HyDE")
 )
 # Define which tools the agent can use to answer user queries.
 # The single "Search" tool is backed by answer_user_question for standard
 # vector search and by answer_question_hyde otherwise.
 tools = [
    Tool(
        name="Search",
        func=answer_user_question
        if add_selectbox == "Standard vector search"
        else answer_question_hyde,
        description="Useful for when you need to answer general knowledge questions. Input should be a fully formed question.",
    )
 ]
 # Seed the two parallel chat-history lists ("generated" = bot replies,
 # "past" = user prompts) if the session does not have them yet.
 for _history_key in ("generated", "past"):
    if _history_key not in st.session_state:
        st.session_state[_history_key] = []
 def query(question):
    """Run ``question`` through the session's agent and return its reply.

    The agent lives in ``st.session_state["agent"]`` and is created from
    ``tools`` on first use. Previously this function looked up a "chat"
    session entry with an ``ask_assistant`` method, which nothing in this
    script ever creates, so any call would have failed.
    """
    if "agent" not in st.session_state:
        st.session_state["agent"] = initiate_agent(tools)
    return st.session_state["agent"].run(question)
 # Question box and submit handler.
 prompt = st.text_input("What do you want to know: ", "", key="input")
 if st.button("Submit", key="generationSubmit"):
    if not prompt.strip():
        # Nothing to ask: skip the agent run (and its API calls) entirely.
        st.warning("Please enter a question before submitting.")
    else:
        with st.spinner("Thinking..."):
            # The agent is cached in the session together with the search
            # mode it was built for. It is rebuilt when the sidebar
            # selection changes; otherwise the cached agent would keep
            # using the tool function chosen on the first submit.
            # Rebuilding starts a fresh conversation memory.
            if (
                "agent" not in st.session_state
                or st.session_state.get("agent_search_mode") != add_selectbox
            ):
                st.session_state["agent"] = initiate_agent(tools)
                st.session_state["agent_search_mode"] = add_selectbox
            response = st.session_state["agent"].run(prompt)
            # Keep prompts and replies in lockstep so index i pairs them up.
            st.session_state.past.append(prompt)
            st.session_state.generated.append(response)
 # Render the conversation (newest exchange first) and the latest search hits.
 if st.session_state["generated"]:
    for i in reversed(range(len(st.session_state["generated"]))):
        # Widget keys are derived from the index so each message is unique.
        message(st.session_state["generated"][i], key=str(i))
        message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
    with st.expander("See search results"):
        try:
            results = list(pd.read_csv("results.csv")["result"])
        except FileNotFoundError:
            # results.csv is written by the search functions in assistant.py;
            # it does not exist if no search has run yet.
            results = []
            st.write("No search results available yet.")
        else:
            st.write(results)
--- a/apps/enterprise-knowledge-retrieval/config.py
+++ b/apps/enterprise-knowledge-retrieval/config.py
@ -0,0 +1,46 @@
 # Connection settings passed to the Redis client in database.py.
 # NOTE(review): port and db are given as strings; presumably the Redis
 # client accepts or coerces them - confirm.
 REDIS_HOST = "localhost"
 REDIS_PORT = "6380"
 REDIS_DB = "0"
 # Name of the Redis search index, and the field queried in the KNN search.
 INDEX_NAME = "wiki-index"
 VECTOR_FIELD_NAME = "content_vector"
 # OpenAI models used for chat completions and for embedding search text.
 CHAT_MODEL = "gpt-3.5-turbo"
 EMBEDDINGS_MODEL = "text-embedding-ada-002"
 # Base template for the agent prompt.
 # Placeholders: {tools}, {tool_names} and {agent_scratchpad} are generated by
 # CustomPromptTemplate.format_messages in assistant.py; {input} and {history}
 # are declared as input variables where the prompt is built
 # (presumably {history} is supplied by the agent's conversation memory - confirm).
 SYSTEM_PROMPT = """You are WikiGPT, a helpful bot who has access to a database of Wikipedia data to answer questions.
 Accept the first answer that you are provided for the user.
 You have access to the following tools::
 {tools}
 Use the following format:
 Question: the input question you must answer
 Thought: you should always think about what to do
 Action: the action to take, should be one of [{tool_names}]
 Action Input: the input to the action
 Observation: the result of the action
 ... (this Thought/Action/Action Input/Observation can repeat N times)
 Thought: I now know the final answer
 Final Answer: the final answer to the original input question
 Begin! Remember to give detailed, informative answers
 Previous conversation history:
 {history}
 New question: {input}
 {agent_scratchpad}"""
 # Prompt that gives the model the original query plus the retrieved content
 # and asks it to answer from that content only.
 # {SEARCH_QUERY_HERE} and {SEARCH_CONTENT_HERE} are str.format placeholders.
 RETRIEVAL_PROMPT = """Use the content to answer the search query the customer has sent. Provide the source for your answer.
 If you can't answer the user's question, say "Sorry, I am unable to answer the question with the content". Do not guess.
 Search query: 
 {SEARCH_QUERY_HERE}
 Content: 
 {SEARCH_CONTENT_HERE}
 Answer:
 """
--- a/apps/enterprise-knowledge-retrieval/data/wikipedia_articles_2000.csv
+++ b/apps/enterprise-knowledge-retrieval/data/wikipedia_articles_2000.csv
--- a/apps/enterprise-knowledge-retrieval/database.py
+++ b/apps/enterprise-knowledge-retrieval/database.py
@ -0,0 +1,72 @@
 import ast
 from math import isnan
 import numpy as np
 import pandas as pd
 import openai
 from redis import Redis as r
 from redis.commands.search.query import Query
 from config import (
    REDIS_DB,
    REDIS_HOST,
    REDIS_PORT,
    VECTOR_FIELD_NAME,
    EMBEDDINGS_MODEL,
    INDEX_NAME,
 )
 def get_redis_connection():
    """Create a Redis client from the host, port and database in ``config``.

    The client is created with ``decode_responses=False``.
    """
    return r(
        host=REDIS_HOST,
        port=REDIS_PORT,
        db=REDIS_DB,
        decode_responses=False,
    )
 # Run a KNN vector search against a Redis index
 def query_redis(redis_conn, query, index_name, top_k=5):
    """Embed ``query`` and return its ``top_k`` nearest documents.

    Args:
        redis_conn: Redis client used to run the search.
        query: text to embed with ``EMBEDDINGS_MODEL``.
        index_name: name of the search index to query.
        top_k: number of neighbours to request (default 5).

    Returns:
        The raw search result object returned by the Redis search call.
    """
    # Embed the query text and pack the vector as float32 bytes.
    embedding_response = openai.Embedding.create(
        input=query,
        model=EMBEDDINGS_MODEL,
    )
    query_vector = np.array(
        embedding_response["data"][0]["embedding"],
        dtype=np.float32,
    ).tobytes()

    # KNN over the vector field, exposing the distance as `vector_score`.
    knn_clause = f"*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]"
    search_query = Query(knn_clause)
    search_query = search_query.sort_by("vector_score")
    search_query = search_query.paging(0, top_k)
    search_query = search_query.return_fields(
        "vector_score", "url", "title", "content", "text_chunk_index"
    )
    search_query = search_query.dialect(2)

    # Execute the query, binding the embedded vector to $vec_param.
    return redis_conn.ft(index_name).search(
        search_query, query_params={"vec_param": query_vector}
    )
 # Get mapped documents from Redis results
 def get_redis_results(redis_conn, query, index_name):
    """Search Redis for ``query`` and return the hits as a DataFrame.

    Args:
        redis_conn: Redis client used to run the search.
        query: text to search for.
        index_name: name of the search index to query.

    Returns:
        A DataFrame with columns ``id`` (rank in the result list), ``url``,
        ``title``, ``result`` (the document content) and ``certainty`` (the
        vector score), one row per hit. With no hits the frame is empty but
        still has these five columns.
    """
    # Get most relevant documents from Redis
    query_result = query_redis(redis_conn, query, index_name)

    # One tuple per hit, in the order Redis returned them.
    rows = [
        (position, doc.url, doc.title, doc.content, doc.vector_score)
        for position, doc in enumerate(query_result.docs)
    ]

    # Return the result as a DataFrame for ease of use. Column names go to
    # the constructor: assigning them afterwards raises ValueError when
    # there are no rows, because an empty frame has no columns to rename.
    return pd.DataFrame(
        rows, columns=["id", "url", "title", "result", "certainty"]
    )
--- a/apps/enterprise-knowledge-retrieval/enterprise_knowledge_retrieval.ipynb
+++ b/apps/enterprise-knowledge-retrieval/enterprise_knowledge_retrieval.ipynb
--- a/apps/enterprise-knowledge-retrieval/img/enterprise_knowledge_retrieval.png
+++ b/apps/enterprise-knowledge-retrieval/img/enterprise_knowledge_retrieval.png
--- a/apps/enterprise-knowledge-retrieval/requirements.txt
+++ b/apps/enterprise-knowledge-retrieval/requirements.txt
@ -0,0 +1,7 @@
 langchain==0.0.158
 numpy==1.24.2
 openai==0.27.4
 pandas==2.0.0
 redis==4.5.4
 streamlit==1.22.0
 streamlit_chat==0.0.2.2