Mirror of https://github.com/openai/openai-cookbook, synced 2024-11-08 01:10:29 +00:00
Pushing enterprise knowledge retrieval to cookbook
commit ab66238d5e (parent d3d8d9742a)
apps/enterprise-knowledge-retrieval/README.md (new file, 36 lines)
@@ -0,0 +1,36 @@
# Enterprise Knowledge Retrieval

This repo is a deep dive on Enterprise Knowledge Retrieval, which aims to take a set of unstructured text documents and build a usable knowledge base application from them.

This repo contains a notebook and a basic Streamlit app:

- `enterprise_knowledge_retrieval.ipynb`: A notebook containing a step-by-step process for tokenising, chunking and embedding your data in a vector database, building a chat agent on top, and running a basic evaluation of its performance.
- `chatbot.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.

To run the app, please follow the instructions below in the `App` section.

## Notebook

The notebook is the best place to start, and takes you through an end-to-end workflow for setting up and evaluating a simple back-end knowledge retrieval service:

- **Setup:** Initialise variables and connect to a vector database.
- **Storage:** Configure the database, prepare our data, and store embeddings and metadata for retrieval.
- **Search:** Extract relevant documents back out with a basic search function and use an LLM to summarise results into a concise reply (a minimal sketch of this step appears at the end of this section).
- **Answer:** Add a more sophisticated agent which will process the user's query and maintain a memory for follow-up questions.
- **Evaluate:** Take question/answer pairs using our service, evaluate them, and plot the results to scope out remedial action.

Once you've run the notebook through to the Search stage, you should have what you need to set up and run the app.
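As a rough sketch of the Search stage, using the helpers defined in `database.py` and the constants in `config.py` from this commit, and assuming the notebook's Storage step has already populated the Redis index:

```python
import openai

from config import CHAT_MODEL, INDEX_NAME, RETRIEVAL_PROMPT
from database import get_redis_connection, get_redis_results

redis_client = get_redis_connection()

question = "Who wrote The Republic?"  # illustrative question

# Pull back the most similar chunks from the vector index
results = get_redis_results(redis_client, question, INDEX_NAME)

# Summarise the top 3 results into a concise reply with the chat model
content = "\n\n".join(results.head(3)["result"])
reply = openai.ChatCompletion.create(
    model=CHAT_MODEL,
    messages=[
        {
            "role": "user",
            "content": RETRIEVAL_PROMPT.format(
                SEARCH_QUERY_HERE=question, SEARCH_CONTENT_HERE=content
            ),
        }
    ],
)["choices"][0]["message"]["content"]
print(reply)
```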
## App

We've rolled in a basic Streamlit app that you can interact with to test your retrieval service using either standard semantic search or HyDE retrieval. HyDE (Hypothetical Document Embeddings) asks an LLM for a hypothetical answer to the question and then searches with the embedding of that answer rather than of the question itself; a minimal sketch follows.
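A simplified version of the HyDE path, condensed from `answer_question_hyde` in `assistant.py`; the one-line prompt here is an illustrative stand-in for the fuller prompt used there:

```python
import openai

from config import CHAT_MODEL, INDEX_NAME
from database import get_redis_connection, get_redis_results

question = "Who discovered penicillin?"  # illustrative question

# 1. Ask the chat model for a plausible (hypothetical) answer
hypothetical = openai.ChatCompletion.create(
    model=CHAT_MODEL,
    messages=[{"role": "user", "content": f"Answer confidently: {question}"}],
)["choices"][0]["message"]["content"]

# 2. Search the knowledge base with the hypothetical answer, not the raw question
results = get_redis_results(get_redis_connection(), hypothetical, INDEX_NAME)
```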
To use it:

- Ensure you have followed the Setup and Storage steps from the notebook to populate a vector database with searchable content.
- Set up a virtual environment with pip by running `virtualenv venv` (ensure `virtualenv` is installed).
- Activate the environment by running `source venv/bin/activate`.
- Install requirements by running `pip install -r requirements.txt`.
- Run `streamlit run chatbot.py` to fire up the Streamlit app in your browser.

## Limitations

- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your needs.
- We introduce many areas you may optimise in the notebook, but we'll dive deeper into these in separate offerings in the coming weeks.
apps/enterprise-knowledge-retrieval/assistant.py (new file, 169 lines)
@@ -0,0 +1,169 @@
from langchain.agents import (
    Tool,
    AgentExecutor,
    LLMSingleActionAgent,
    AgentOutputParser,
)
from langchain.prompts import BaseChatPromptTemplate
from langchain import LLMChain
from langchain.chat_models import ChatOpenAI
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish, HumanMessage
from langchain.memory import ConversationBufferWindowMemory
import openai
import re
import streamlit as st

from database import get_redis_results, get_redis_connection
from config import RETRIEVAL_PROMPT, CHAT_MODEL, INDEX_NAME, SYSTEM_PROMPT


# Shared Redis connection used by both retrieval functions
redis_client = get_redis_connection()

def answer_user_question(query):
    # Retrieve the most relevant documents for the query
    results = get_redis_results(redis_client, query, INDEX_NAME)

    # Persist the results so the app can display them in an expander
    results.to_csv("results.csv")

    # Concatenate the top 3 results into a single block of context
    search_content = ""
    for _, row in results.head(3).iterrows():
        search_content += row["title"] + "\n" + row["result"] + "\n\n"

    retrieval_prepped = RETRIEVAL_PROMPT.format(
        SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
    )

    retrieval = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": retrieval_prepped}],
        max_tokens=500,
    )

    # Response provided by GPT-3.5
    return retrieval["choices"][0]["message"]["content"]


def answer_question_hyde(query):
    # HyDE: generate a hypothetical answer first, then search with that
    # answer rather than with the raw query
    hyde_prompt = """You are OracleGPT, a helpful expert who answers user questions to the best of your ability.
Provide a confident answer to their question. If you don't know the answer, make the best guess you can based on the context of the question.

User question: {USER_QUESTION_HERE}

Answer:"""

    hypothetical_answer = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[
            {
                "role": "user",
                "content": hyde_prompt.format(USER_QUESTION_HERE=query),
            }
        ],
    )["choices"][0]["message"]["content"]
    # st.write(hypothetical_answer)
    results = get_redis_results(redis_client, hypothetical_answer, INDEX_NAME)

    results.to_csv("results.csv")

    search_content = ""
    for _, row in results.head(3).iterrows():
        search_content += row["title"] + "\n" + row["result"] + "\n\n"

    # Use .format, as above, so the {placeholder} braces are filled rather
    # than left behind by a plain string replace
    retrieval_prepped = RETRIEVAL_PROMPT.format(
        SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
    )
    retrieval = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": retrieval_prepped}],
        max_tokens=500,
    )

    return retrieval["choices"][0]["message"]["content"]


# Set up a prompt template
class CustomPromptTemplate(BaseChatPromptTemplate):
    # The template to use
    template: str
    # The list of tools available
    tools: List[Tool]

    def format_messages(self, **kwargs) -> List[HumanMessage]:
        # Get the intermediate steps (AgentAction, Observation tuples)
        # Format them in a particular way
        intermediate_steps = kwargs.pop("intermediate_steps")
        thoughts = ""
        for action, observation in intermediate_steps:
            thoughts += action.log
            thoughts += f"\nObservation: {observation}\nThought: "
        # Set the agent_scratchpad variable to that value
        kwargs["agent_scratchpad"] = thoughts
        # Create a tools variable from the list of tools provided
        kwargs["tools"] = "\n".join(
            [f"{tool.name}: {tool.description}" for tool in self.tools]
        )
        # Create a list of tool names for the tools provided
        kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
        formatted = self.template.format(**kwargs)
        return [HumanMessage(content=formatted)]


class CustomOutputParser(AgentOutputParser):
    def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
        # Check if agent should finish
        if "Final Answer:" in llm_output:
            return AgentFinish(
                # Return values is generally always a dictionary with a single `output` key
                # It is not recommended to try anything else at the moment :)
                return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
                log=llm_output,
            )
        # Parse out the action and action input
        regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
        match = re.search(regex, llm_output, re.DOTALL)
        if not match:
            raise ValueError(f"Could not parse LLM output: `{llm_output}`")
        action = match.group(1).strip()
        action_input = match.group(2)
        # Return the action and action input
        return AgentAction(
            tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output
        )


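# Illustrative note (an addition, not part of the original file): given the
# ReAct format defined by SYSTEM_PROMPT in config.py, the parser above turns
# model output such as
#
#   Thought: I should search the knowledge base
#   Action: Search
#   Action Input: Who discovered penicillin?
#
# into AgentAction(tool="Search", tool_input="Who discovered penicillin?"),
# while output containing "Final Answer: ..." yields an AgentFinish instead.

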
def initiate_agent(tools):
    prompt = CustomPromptTemplate(
        template=SYSTEM_PROMPT,
        tools=tools,
        # This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
        # The history template includes "history" as an input variable so we can interpolate it into the prompt
        input_variables=["input", "intermediate_steps", "history"],
    )

    # Initiate the memory with k=2 to keep the last two turns
    # Provide the memory to the agent
    memory = ConversationBufferWindowMemory(k=2)

    output_parser = CustomOutputParser()

    llm = ChatOpenAI(temperature=0)

    # LLM chain consisting of the LLM and a prompt
    llm_chain = LLMChain(llm=llm, prompt=prompt)

    tool_names = [tool.name for tool in tools]
    agent = LLMSingleActionAgent(
        llm_chain=llm_chain,
        output_parser=output_parser,
        stop=["\nObservation:"],
        allowed_tools=tool_names,
    )

    agent_executor = AgentExecutor.from_agent_and_tools(
        agent=agent, tools=tools, verbose=True, memory=memory
    )

    return agent_executor
apps/enterprise-knowledge-retrieval/chatbot.py (new file, 76 lines)
@@ -0,0 +1,76 @@
from langchain.agents import Tool
import pandas as pd
import streamlit as st
from streamlit_chat import message

from database import get_redis_connection
from assistant import answer_user_question, initiate_agent, answer_question_hyde

# Initialise database

## Initialise Redis connection
redis_client = get_redis_connection()


### CHATBOT APP

# --- GENERAL SETTINGS ---
PAGE_TITLE: str = "Knowledge Retrieval Bot"
PAGE_ICON: str = "🤖"

st.set_page_config(page_title=PAGE_TITLE, page_icon=PAGE_ICON)

st.title("Wiki Chatbot")
st.subheader("Learn things - random things!")

# Sidebar toggle between standard semantic search and HyDE
add_selectbox = st.sidebar.selectbox(
    "What kind of search?", ("Standard vector search", "HyDE")
)

# Define which tools the agent can use to answer user queries
tools = [
    Tool(
        name="Search",
        func=answer_user_question
        if add_selectbox == "Standard vector search"
        else answer_question_hyde,
        description="Useful for when you need to answer general knowledge questions. Input should be a fully formed question.",
    )
]

# Chat history lives in session state so it survives Streamlit reruns
if "generated" not in st.session_state:
    st.session_state["generated"] = []

if "past" not in st.session_state:
    st.session_state["past"] = []


prompt = st.text_input("What do you want to know: ", "", key="input")

if st.button("Submit", key="generationSubmit"):
    with st.spinner("Thinking..."):
        # Initialise the agent once and reuse it across reruns
        if "agent" not in st.session_state:
            st.session_state["agent"] = initiate_agent(tools)

        response = st.session_state["agent"].run(prompt)

        st.session_state.past.append(prompt)
        st.session_state.generated.append(response)

# Render the conversation, most recent exchange first
if len(st.session_state["generated"]) > 0:
    for i in range(len(st.session_state["generated"]) - 1, -1, -1):
        message(st.session_state["generated"][i], key=str(i))
        message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")

    with st.expander("See search results"):
        results = list(pd.read_csv("results.csv")["result"])
        st.write(results)
apps/enterprise-knowledge-retrieval/config.py (new file, 46 lines)
@@ -0,0 +1,46 @@
REDIS_HOST = "localhost"
REDIS_PORT = "6380"
REDIS_DB = "0"
INDEX_NAME = "wiki-index"
VECTOR_FIELD_NAME = "content_vector"
CHAT_MODEL = "gpt-3.5-turbo"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
# Set up the base template
SYSTEM_PROMPT = """You are WikiGPT, a helpful bot who has access to a database of Wikipedia data to answer questions.
Accept the first answer that you are provided for the user.
You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin! Remember to give detailed, informative answers.

Previous conversation history:
{history}

New question: {input}
{agent_scratchpad}"""
# Build a prompt to provide the original query and the result, and ask the model to summarise for the user
RETRIEVAL_PROMPT = """Use the content to answer the search query the customer has sent. Provide the source for your answer.
If you can't answer the user's question, say "Sorry, I am unable to answer the question with the content". Do not guess.

Search query:

{SEARCH_QUERY_HERE}

Content:

{SEARCH_CONTENT_HERE}

Answer:
"""
apps/enterprise-knowledge-retrieval/data/wikipedia_articles_2000.csv (new file, 60403 lines)
File diff suppressed because it is too large.
apps/enterprise-knowledge-retrieval/database.py (new file, 72 lines)
@@ -0,0 +1,72 @@
import numpy as np
import pandas as pd
import openai
from redis import Redis as r
from redis.commands.search.query import Query

from config import (
    REDIS_DB,
    REDIS_HOST,
    REDIS_PORT,
    VECTOR_FIELD_NAME,
    EMBEDDINGS_MODEL,
    INDEX_NAME,
)


def get_redis_connection():
    redis_client = r(
        host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=False
    )
    return redis_client


# Make query to Redis
def query_redis(redis_conn, query, index_name, top_k=5):
    ## Creates embedding vector from user query
    embedded_query = np.array(
        openai.Embedding.create(input=query, model=EMBEDDINGS_MODEL)["data"][0][
            "embedding"
        ],
        dtype=np.float32,
    ).tobytes()

    # Prepare the KNN vector-similarity query
    q = (
        Query(f"*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]")
        .sort_by("vector_score")
        .paging(0, top_k)
        .return_fields("vector_score", "url", "title", "content", "text_chunk_index")
        .dialect(2)
    )
    params_dict = {"vec_param": embedded_query}

    # Execute the query
    results = redis_conn.ft(index_name).search(q, query_params=params_dict)

    return results


# Get mapped documents from Redis results
def get_redis_results(redis_conn, query, index_name):
    # Get most relevant documents from Redis
    query_result = query_redis(redis_conn, query, index_name)

    # Extract info into a list
    query_result_list = []
    for i, result in enumerate(query_result.docs):
        result_order = i
        url = result.url
        title = result.title
        text = result.content
        score = result.vector_score
        query_result_list.append((result_order, url, title, text, score))

    # Display result as a DataFrame for ease of use
    result_df = pd.DataFrame(query_result_list)
    result_df.columns = ["id", "url", "title", "result", "certainty"]
    return result_df
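The query helpers above assume an index that the notebook's Storage step has already created and populated. For reference, a minimal sketch of creating such an index with redis-py is shown below; the schema mirrors the fields `query_redis` returns, but the `doc:` key prefix, the HNSW algorithm choice, and the 1536-dimension size (the output size of `text-embedding-ada-002`) are assumptions rather than details taken from this commit:

```python
from redis.commands.search.field import TextField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType

from config import INDEX_NAME, VECTOR_FIELD_NAME
from database import get_redis_connection

redis_client = get_redis_connection()

# Field names mirror those returned by query_redis; vector params are assumed
redis_client.ft(INDEX_NAME).create_index(
    fields=[
        TextField("url"),
        TextField("title"),
        TextField("content"),
        TextField("text_chunk_index"),
        VectorField(
            VECTOR_FIELD_NAME,
            "HNSW",
            {"TYPE": "FLOAT32", "DIM": 1536, "DISTANCE_METRIC": "COSINE"},
        ),
    ],
    definition=IndexDefinition(prefix=["doc:"], index_type=IndexType.HASH),
)
```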
File diff suppressed because it is too large (new file, not shown).
Binary file not shown (new image, 72 KiB).
apps/enterprise-knowledge-retrieval/requirements.txt (new file, 7 lines)
@@ -0,0 +1,7 @@
langchain==0.0.158
numpy==1.24.2
openai==0.27.4
pandas==2.0.0
redis==4.5.4
streamlit==1.22.0
streamlit_chat==0.0.2.2