Pushing enterprise knowledge retrieval to cookbook

This commit is contained in:
Colin Jarvis 2023-05-11 10:38:30 +01:00
parent d3d8d9742a
commit ab66238d5e
9 changed files with 62559 additions and 0 deletions

View File

@ -0,0 +1,36 @@
# Enterprise Knowledge Retrieval
This repo is a deep dive on Enterprise Knowledge Retrieval, which aims to take some unstructured text documents and create a usable knowledge base application from them.
This repo contains a notebook and a basic Streamlit app:
- `enterprise_knowledge_retrieval.ipynb`: A notebook containing a step by step process of tokenising, chunking and embedding your data in a vector database, building a chat agent on top and running a basic evaluation of its performance
- `chatbot.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.
To run the app, please follow the instructions in the ```App``` section below.
## Notebook
The notebook is the best place to start, and takes you through an end-to-end workflow for setting up and evaluating a simple back-end knowledge retrieval service:
- **Setup:** Initiate variables and connect to a vector database.
- **Storage:** Configure the database, prepare our data and store embeddings and metadata for retrieval.
- **Search:** Extract relevant documents back out with a basic search function and use an LLM to summarise results into a concise reply.
- **Answer:** Add a more sophisticated agent which will process the user's query and maintain a memory for follow-up questions.
- **Evaluate:** Take question/answer pairs using our service, evaluate and plot them to scope out remedial action
Once you've run the notebook through to the Search stage, you should have what you need to set up and run the app.
## App
We've rolled in a basic Streamlit app that you can interact with to test your retrieval service using either standard semantic search or HyDE (Hypothetical Document Embeddings) retrievals.
To use it:
- Ensure you have followed the Setup and Storage steps from the notebook to populate a vector database with searchable content.
- Set up a virtual environment by running ```virtualenv venv``` (ensure ```virtualenv``` is installed, e.g. via pip).
- Activate the environment by running ```source venv/bin/activate```.
- Install requirements by running ```pip install -r requirements.txt```.
- Run ```streamlit run chatbot.py``` to fire up the Streamlit app in your browser.
## Limitations
- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your needs.
- We introduce many areas you may optimize in the notebook, but we'll deep dive on these in separate offerings in the coming weeks.

View File

@ -0,0 +1,169 @@
from langchain.agents import (
Tool,
AgentExecutor,
LLMSingleActionAgent,
AgentOutputParser,
)
from langchain.prompts import BaseChatPromptTemplate
from langchain import SerpAPIWrapper, LLMChain
from langchain.chat_models import ChatOpenAI
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish, HumanMessage
from langchain.memory import ConversationBufferWindowMemory
import openai
import re
import streamlit as st
from database import get_redis_results, get_redis_connection
from config import RETRIEVAL_PROMPT, CHAT_MODEL, INDEX_NAME, SYSTEM_PROMPT
# Module-level Redis client, created once when this module is imported and
# passed to get_redis_results by the answer functions below.
redis_client = get_redis_connection()
def answer_user_question(query):
    """Answer ``query`` from the top vector-search hits for the question.

    The full result set is written to ``results.csv`` in the working
    directory, the first three hits are folded into ``RETRIEVAL_PROMPT``
    together with the question, and the chat model's reply text is returned.
    """
    hits = get_redis_results(redis_client, query, INDEX_NAME)
    hits.to_csv("results.csv")

    # Each of the first three hits contributes "<title>\n<result>\n\n".
    search_content = "".join(
        row["title"] + "\n" + row["result"] + "\n\n"
        for _, row in hits.head(3).iterrows()
    )

    prompt_text = RETRIEVAL_PROMPT.format(
        SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
    )
    completion = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": prompt_text}],
        max_tokens=500,
    )

    # Only the text of the first choice is handed back to the caller.
    return completion["choices"][0]["message"]["content"]
def answer_question_hyde(query):
    """Answer ``query`` using a hypothetical answer as the search text.

    The chat model is first asked to produce a best-guess answer to the
    question. That hypothetical answer, rather than the raw question, is
    passed to ``get_redis_results`` as the search text. The full result set
    is written to ``results.csv``; the first three hits are combined with the
    original question in ``RETRIEVAL_PROMPT`` and the chat model's reply text
    is returned.
    """
    hyde_prompt = """You are OracleGPT, an helpful expert who answers user questions to the best of their ability.
Provide a confident answer to their question. If you don't know the answer, make the best guess you can based on the context of the question.
User question: {USER_QUESTION_HERE}
Answer:"""

    hypothetical_answer = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[
            {
                "role": "user",
                "content": hyde_prompt.format(USER_QUESTION_HERE=query),
            }
        ],
    )["choices"][0]["message"]["content"]

    results = get_redis_results(redis_client, hypothetical_answer, INDEX_NAME)
    results.to_csv("results.csv")

    # Each of the first three hits contributes "<title>\n<result>\n\n".
    search_content = "".join(
        row["title"] + "\n" + row["result"] + "\n\n"
        for _, row in results.head(3).iterrows()
    )

    # RETRIEVAL_PROMPT uses "{...}" placeholders, so it must be filled with
    # str.format: replacing the bare placeholder names would leave literal
    # braces wrapped around the query and the content in the final prompt.
    retrieval_prepped = RETRIEVAL_PROMPT.format(
        SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
    )
    retrieval = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": retrieval_prepped}],
        max_tokens=500,
    )
    return retrieval["choices"][0]["message"]["content"]
# Set up a prompt template
class CustomPromptTemplate(BaseChatPromptTemplate):
    """Prompt template that renders the agent prompt as one human message.

    ``agent_scratchpad``, ``tools`` and ``tool_names`` are computed inside
    ``format_messages``; the remaining template variables are taken from the
    keyword arguments passed in.
    """

    # The template to use
    template: str
    # The list of tools available
    tools: List[Tool]

    def format_messages(self, **kwargs) -> List[HumanMessage]:
        """Fill the template and return it wrapped in a one-element list.

        ``kwargs`` must contain ``intermediate_steps``, an iterable of
        (action, observation) pairs; it is removed from ``kwargs`` and
        rendered into the ``agent_scratchpad`` variable. A missing
        ``intermediate_steps`` key raises ``KeyError``.
        """
        # Get the intermediate steps (AgentAction, Observation tuples) and
        # render each one as the action's log followed by its observation and
        # a trailing "Thought: " cue.
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = "".join(
            action.log + f"\nObservation: {observation}\nThought: "
            for action, observation in intermediate_steps
        )

        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join(
            f"{tool.name}: {tool.description}" for tool in self.tools
        )
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join(tool.name for tool in self.tools)

        formatted = self.template.format(**kwargs)
        return [HumanMessage(content=formatted)]
class CustomOutputParser(AgentOutputParser):
    """Turn raw LLM text into either a finish signal or the next tool call."""

    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        """Parse ``llm_output`` into an ``AgentFinish`` or an ``AgentAction``.

        Raises:
            ValueError: if the text has no "Final Answer:" marker and does not
                match the Action / Action Input pattern.
        """
        final_marker = "Final Answer:"
        if final_marker in llm_output:
            # Everything after the last marker is the answer. It is returned
            # under the single `output` key, as the original code advises.
            answer = llm_output.split(final_marker)[-1].strip()
            return AgentFinish(return_values={"output": answer}, log=llm_output)

        # Otherwise pull out the tool name and the input intended for it.
        pattern = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        found = re.search(pattern, llm_output, re.DOTALL)
        if found is None:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")

        tool_name, raw_input = found.group(1), found.group(2)
        return AgentAction(
            tool=tool_name.strip(),
            # Trim surrounding spaces first, then any wrapping double quotes.
            tool_input=raw_input.strip(" ").strip('"'),
            log=llm_output,
        )
def initiate_agent(tools):
    """Assemble and return an ``AgentExecutor`` that can call ``tools``.

    The executor wraps a single-action agent driven by ``SYSTEM_PROMPT`` and
    carries a windowed conversation memory.
    """
    # `agent_scratchpad`, `tools` and `tool_names` are left out of
    # input_variables because CustomPromptTemplate generates them itself;
    # "history" is included so it can be interpolated into the prompt.
    prompt = CustomPromptTemplate(
        template=SYSTEM_PROMPT,
        tools=tools,
        input_variables=["input", "intermediate_steps", "history"],
    )

    # k=2 keeps the last two turns of the conversation.
    memory = ConversationBufferWindowMemory(k=2)

    # LLM chain consisting of the LLM and the prompt.
    llm_chain = LLMChain(llm=ChatOpenAI(temperature=0), prompt=prompt)

    agent = LLMSingleActionAgent(
        llm_chain=llm_chain,
        output_parser=CustomOutputParser(),
        stop=["\nObservation:"],
        allowed_tools=[tool.name for tool in tools],
    )

    return AgentExecutor.from_agent_and_tools(
        agent=agent, tools=tools, verbose=True, memory=memory
    )

View File

@ -0,0 +1,76 @@
from langchain.agents import Tool
import pandas as pd
import streamlit as st
from streamlit_chat import message
from database import get_redis_connection
from assistant import answer_user_question, initiate_agent, answer_question_hyde
# Initialise database
## Initialise Redis connection
redis_client = get_redis_connection()

### CHATBOT APP

# --- GENERAL SETTINGS ---
PAGE_TITLE: str = "Knowledge Retrieval Bot"
PAGE_ICON: str = "🤖"

st.set_page_config(page_title=PAGE_TITLE, page_icon=PAGE_ICON)

st.title("Wiki Chatbot")
st.subheader("Learn things - random things!")

# Sidebar control that chooses which answer function backs the Search tool.
add_selectbox = st.sidebar.selectbox(
    "What kind of search?", ("Standard vector search", "HyDE")
)

# Define which tools the agent can use to answer user queries
if add_selectbox == "Standard vector search":
    _search_func = answer_user_question
else:
    _search_func = answer_question_hyde

tools = [
    Tool(
        name="Search",
        func=_search_func,
        description="Useful for when you need to answer general knowledge questions. Input should be a fully formed question.",
    )
]

# Make sure both chat-history lists exist in the session state.
for _state_key in ("generated", "past"):
    if _state_key not in st.session_state:
        st.session_state[_state_key] = []
def query(question):
    """Run ``question`` through the session's agent and return its reply.

    The agent is stored under ``st.session_state["agent"]`` and is created
    from the module-level ``tools`` on first use.
    """
    # Nothing in this script ever stores a "chat" entry in the session state,
    # so the helper goes through the same "agent" entry that the submit
    # handler uses instead of raising on a missing key.
    if "agent" not in st.session_state:
        st.session_state["agent"] = initiate_agent(tools)
    return st.session_state["agent"].run(question)
prompt = st.text_input("What do you want to know: ", "", key="input")

if st.button("Submit", key="generationSubmit"):
    if not prompt.strip():
        # Nothing to ask: do not spend an agent run on an empty question.
        st.warning("Please enter a question first.")
    else:
        with st.spinner("Thinking..."):
            # Build the agent on first use, and rebuild it whenever the
            # sidebar search mode differs from the one it was built with;
            # otherwise the agent would keep calling the search function that
            # was selected at the first submit. Rebuilding starts the agent
            # with a fresh conversation memory.
            if (
                "agent" not in st.session_state
                or st.session_state.get("agent_search_mode") != add_selectbox
            ):
                st.session_state["agent"] = initiate_agent(tools)
                st.session_state["agent_search_mode"] = add_selectbox

            response = st.session_state["agent"].run(prompt)
            st.session_state.past.append(prompt)
            st.session_state.generated.append(response)

# Render the conversation newest-first: each reply followed by its question.
for i in reversed(range(len(st.session_state["generated"]))):
    message(st.session_state["generated"][i], key=str(i))
    message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")

with st.expander("See search results"):
    # results.csv is only written once a search has run, so it may not exist
    # yet when the page is first loaded.
    try:
        results = list(pd.read_csv("results.csv")["result"])
    except FileNotFoundError:
        st.write("No search results yet.")
    else:
        st.write(results)

View File

@ -0,0 +1,46 @@
# Redis connection settings passed to the Redis client constructor.
REDIS_HOST = "localhost"
REDIS_PORT = "6380"  # NOTE(review): Redis commonly listens on 6379 — confirm 6380 matches your deployment
REDIS_DB = "0"
# Name of the search index that retrieval queries are run against.
INDEX_NAME = "wiki-index"
# Name of the vector field referenced by the KNN search query.
VECTOR_FIELD_NAME = "content_vector"
# OpenAI model names: one for chat completions, one for embedding queries.
CHAT_MODEL = "gpt-3.5-turbo"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
# Set up the base template
# Agent prompt. The placeholders {tools}, {tool_names}, {history}, {input} and
# {agent_scratchpad} are filled in with str.format when the prompt is rendered.
# The "Action:", "Action Input:" and "Final Answer:" labels are what the
# agent's output parser looks for, so keep them in sync with that parser.
SYSTEM_PROMPT = """You are WikiGPT, a helpful bot who has access to a database of Wikipedia data to answer questions.
Accept the first answer that you are provided for the user.
You have access to the following tools::
{tools}
Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin! Remember to give detailed, informative answers
Previous conversation history:
{history}
New question: {input}
{agent_scratchpad}"""
# Build a prompt to provide the original query, the result and ask to summarise for the user
# {SEARCH_QUERY_HERE} receives the user's question and {SEARCH_CONTENT_HERE}
# receives the concatenated titles and text of the retrieved search hits.
RETRIEVAL_PROMPT = """Use the content to answer the search query the customer has sent. Provide the source for your answer.
If you can't answer the user's question, say "Sorry, I am unable to answer the question with the content". Do not guess.
Search query:
{SEARCH_QUERY_HERE}
Content:
{SEARCH_CONTENT_HERE}
Answer:
"""

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,72 @@
import ast
from math import isnan
import numpy as np
import pandas as pd
import openai
from redis import Redis as r
from redis.commands.search.query import Query
from config import (
REDIS_DB,
REDIS_HOST,
REDIS_PORT,
VECTOR_FIELD_NAME,
EMBEDDINGS_MODEL,
INDEX_NAME,
)
def get_redis_connection():
    """Return a Redis client configured from the ``REDIS_*`` settings.

    The client is created with ``decode_responses=False``.
    """
    connection_settings = {
        "host": REDIS_HOST,
        "port": REDIS_PORT,
        "db": REDIS_DB,
        "decode_responses": False,
    }
    return r(**connection_settings)
# Make query to Redis
def query_redis(redis_conn, query, index_name, top_k=5):
    """Embed ``query`` and run a KNN vector search on ``index_name``.

    Args:
        redis_conn: Redis client used to execute the search.
        query: Text to embed with ``EMBEDDINGS_MODEL``.
        index_name: Name of the search index to query.
        top_k: Number of neighbours requested and maximum results returned.

    Returns:
        The search result object returned by the Redis client.
    """
    # Create the embedding vector for the query and serialise it as raw
    # float32 bytes for use as the query parameter.
    embedding_response = openai.Embedding.create(
        input=query, model=EMBEDDINGS_MODEL
    )
    query_vector = np.array(
        embedding_response["data"][0]["embedding"], dtype=np.float32
    ).tobytes()

    # Prepare the query: KNN over the vector field, sorted by vector_score,
    # limited to top_k results, returning only the listed fields.
    knn_clause = f"*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]"
    search_query = (
        Query(knn_clause)
        .sort_by("vector_score")
        .paging(0, top_k)
        .return_fields("vector_score", "url", "title", "content", "text_chunk_index")
        .dialect(2)
    )

    # Execute the query with the serialised vector bound to $vec_param.
    return redis_conn.ft(index_name).search(
        search_query, query_params={"vec_param": query_vector}
    )
# Get mapped documents from Redis results
def get_redis_results(redis_conn, query, index_name):
    """Search Redis for ``query`` and return the hits as a DataFrame.

    Args:
        redis_conn: Redis client used to execute the search.
        query: Text to search for.
        index_name: Name of the search index to query.

    Returns:
        A DataFrame with columns ``id`` (position of the hit in the result
        list), ``url``, ``title``, ``result`` (the hit's ``content`` field)
        and ``certainty`` (the hit's ``vector_score`` field). When the search
        returns no documents the DataFrame is empty but keeps these columns.
    """
    # Get most relevant documents from Redis
    query_result = query_redis(redis_conn, query, index_name)

    # Extract the fields of interest from each returned document.
    rows = [
        (position, doc.url, doc.title, doc.content, doc.vector_score)
        for position, doc in enumerate(query_result.docs)
    ]

    # Return the results as a DataFrame for ease of use. The column names are
    # given to the constructor rather than assigned afterwards: assigning five
    # names to a frame built from an empty list raises a length-mismatch
    # ValueError.
    return pd.DataFrame(
        rows, columns=["id", "url", "title", "result", "certainty"]
    )

File diff suppressed because it is too large Load Diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 72 KiB

View File

@ -0,0 +1,7 @@
langchain==0.0.158
numpy==1.24.2
openai==0.27.4
pandas==2.0.0
redis==4.5.4
streamlit==1.22.0
streamlit_chat==0.0.2.2