Initial commit of simple app demonstrating chatbot starting point in notebook and Streamlit

pull/206/head
colin-openai 1 year ago
parent c3b920f123
commit 771f108f77

@ -0,0 +1,35 @@
# Powering your products with ChatGPT and your own data
The Chatbot Kickstarter is a starter repo to get you started building a basic Chatbot using the ChatGPT API and your own knowledge base. The flow you're taken through was originally presented with [these slides](https://drive.google.com/file/d/1dB-RQhZC_Q1iAsHkNNdkqtxxXqYODFYy/view?usp=share_link), which may be useful to refer to.
This repo contains one notebook and two basic Streamlit apps:
- `powering_your_products_with_chatgpt_and_your_data.ipynb`: A notebook containing a step by step process of tokenising, chunking and embedding your data in a vector database, and building simple Q&A and Chatbot functionality on top.
- `search.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.
- `chat.py`: A Streamlit app providing a simple Chatbot via a search bar to query your knowledge base.
To run either app, please follow the instructions in the respective README.md files in the subdirectories.
## How it works
The notebook is the best place to start, and is broadly laid out as follows:
- **Lay the foundations:**
- Set up the vector database to accept vectors and data
- Load the dataset, chunk the data up for embedding and store in the vector database
- **Make it a product:**
- Add a retrieval step where users provide queries and we return the most relevant entries
- Summarise search results with GPT-3
- Test out this basic Q&A app in Streamlit
- **Build your moat:**
- Create an Assistant class to manage context and interact with our bot
- Use the Chatbot to answer questions using semantic search context
- Test out this basic Chatbot app in Streamlit
Once you've run the notebook and tried the two Streamlit apps, you should be in a position to strip out any useful snippets and start your own Q&A or Chat application.
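For orientation, the core retrieval-and-summarise step that the notebook and `search.py` build up to looks roughly like the sketch below. This is a minimal illustration, assuming a running Redis instance already populated with embeddings and an OpenAI API key configured in your environment:

```python
import openai
from database import get_redis_connection, get_redis_results
from config import COMPLETIONS_MODEL, INDEX_NAME

# Connect to Redis and pull back the most relevant chunks for the query
redis_client = get_redis_connection()
question = "what is the cost cap for a power unit in 2023"
results = get_redis_results(redis_client, question, INDEX_NAME)

# Ask GPT-3 to answer the question using the top search result as context
answer_prompt = f"Answer this question: {question}\nUsing this content: {results['result'][0]}\nAnswer:"
answer = openai.Completion.create(engine=COMPLETIONS_MODEL, prompt=answer_prompt, max_tokens=500)
print(answer["choices"][0]["text"])
```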
## Limitations
- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your need.
- This is a simple starting point - if you hit issues deploying your use case, you may need to tune (non-exhaustive list; a short sketch of where these knobs live follows this list):
- The prompt and parameters for the model for it to answer accurately
- Your search to return more relevant results
- Your chunking/embedding approach to store the most relevant content effectively for retrieval
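
As a rough guide, the sketch below points at where those knobs live in this repo; the specific values are illustrative assumptions, not recommendations:

```python
import openai
from database import get_redis_connection, query_redis
from config import CHAT_MODEL, INDEX_NAME

redis_client = get_redis_connection()

# Search relevance: raise top_k to pull back more candidate chunks per query
results = query_redis(redis_client, "power unit cost cap", INDEX_NAME, top_k=5)

# Prompt and model parameters: adjust the messages, temperature and max_tokens until answers are accurate enough
completion = openai.ChatCompletion.create(
    model=CHAT_MODEL,
    messages=[{"role": "user", "content": "Summarise the 2023 power unit cost cap rules."}],
    temperature=0.1,   # lower = more deterministic answers
    max_tokens=500,
)

# Chunking/embedding: TEXT_EMBEDDING_CHUNK_SIZE in the config controls how much text each stored chunk holds
```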

@ -0,0 +1,83 @@
import streamlit as st
from streamlit_chat import message
from database import get_redis_connection
from chatbot import RetrievalAssistant, Message
# Initialise database
## Initialise Redis connection
redis_client = get_redis_connection()
# Set instruction
# System prompt requiring Question and Year to be extracted from the user
system_prompt = '''
You are a helpful Formula 1 knowledge base assistant. You need to capture a Question and Year from each customer.
The Question is their query on Formula 1, and the Year is the year of the applicable Formula 1 season.
Think about this step by step:
- The user will ask a Question
- You will ask them for the Year if their question didn't include a Year
- Once you have the Year, say "searching for answers".
Example:
User: I'd like to know the cost cap for a power unit
Assistant: Certainly, what year would you like this for?
User: 2023 please.
Assistant: Searching for answers.
'''
### CHATBOT APP
st.set_page_config(
    page_title="Streamlit Chat - Demo",
    page_icon=":robot:"
)
st.title('Formula 1 Chatbot')
st.subheader("Help us help you learn about Formula 1")
if 'generated' not in st.session_state:
    st.session_state['generated'] = []

if 'past' not in st.session_state:
    st.session_state['past'] = []

def query(question):
    response = st.session_state['chat'].ask_assistant(question)
    return response
prompt = st.text_input("What do you want to know: ","", key="input")
if st.button('Submit', key='generationSubmit'):

    # Initialization
    if 'chat' not in st.session_state:
        st.session_state['chat'] = RetrievalAssistant()
        messages = []
        system_message = Message('system', system_prompt)
        messages.append(system_message.message())
    else:
        messages = []

    user_message = Message('user', prompt)
    messages.append(user_message.message())

    response = query(messages)

    # Debugging step to print the whole response
    # st.write(response)

    st.session_state.past.append(prompt)
    st.session_state.generated.append(response['content'])

if st.session_state['generated']:

    for i in range(len(st.session_state['generated']) - 1, -1, -1):
        message(st.session_state["generated"][i], key=str(i))
        message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
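
# Usage note (illustrative, not part of the original file): with the requirements installed and a
# local Redis instance holding the populated index, this app can be started with
# `streamlit run chat.py`; the search-only variant runs with `streamlit run search.py`.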

@ -0,0 +1,84 @@
import openai
from termcolor import colored
import streamlit as st
from database import get_redis_connection,get_redis_results
from config import CHAT_MODEL,COMPLETIONS_MODEL, INDEX_NAME
redis_client = get_redis_connection()
# A basic class to create a message as a dict for chat
class Message:

    def __init__(self, role, content):
        self.role = role
        self.content = content

    def message(self):
        return {"role": self.role, "content": self.content}
# New Assistant class to add a vector database call to its responses
class RetrievalAssistant:

    def __init__(self):
        self.conversation_history = []

    def _get_assistant_response(self, prompt):
        try:
            completion = openai.ChatCompletion.create(
                model=CHAT_MODEL,
                messages=prompt,
                temperature=0.1
            )

            response_message = Message(
                completion['choices'][0]['message']['role'],
                completion['choices'][0]['message']['content']
            )
            return response_message.message()

        except Exception as e:
            return f'Request failed with exception {e}'

    # The function to retrieve Redis search results
    def _get_search_results(self, prompt):
        latest_question = prompt
        search_content = get_redis_results(redis_client, latest_question, INDEX_NAME)['result'][0]
        return search_content

    def ask_assistant(self, next_user_prompt):
        self.conversation_history.extend(next_user_prompt)
        assistant_response = self._get_assistant_response(self.conversation_history)
        # Answer normally unless the trigger phrase "searching for answers" appears in the response
        if 'searching for answers' in assistant_response['content'].lower():
            question_extract = openai.Completion.create(
                model=COMPLETIONS_MODEL,
                prompt=f"Extract the user's latest question and the year for that question from this conversation: {self.conversation_history}. Extract it as a sentence stating the Question and Year"
            )
            search_result = self._get_search_results(question_extract['choices'][0]['text'])

            # We insert an extra system prompt here to give fresh context to the Chatbot on how to use the Redis results
            # In this instance we add it to the conversation history, but in production it may be better to hide
            self.conversation_history.insert(
                -1,
                {
                    "role": 'system',
                    "content": f"Answer the user's question using this content: {search_result}. If you cannot answer the question, say 'Sorry, I don't know the answer to this one'"
                }
            )

            assistant_response = self._get_assistant_response(self.conversation_history)
            self.conversation_history.append(assistant_response)
            return assistant_response
        else:
            self.conversation_history.append(assistant_response)
            return assistant_response
    def pretty_print_conversation_history(self, colorize_assistant_replies=True):
        for entry in self.conversation_history:
            if entry['role'] == 'system':
                pass
            else:
                prefix = entry['role']
                content = entry['content']
                output = colored(prefix + ':\n' + content, 'green') if colorize_assistant_replies and entry['role'] == 'assistant' else prefix + ':\n' + content
                print(output)
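
# Illustrative usage sketch (not part of the original file), mirroring how chat.py drives this class:
#
#     assistant = RetrievalAssistant()
#     messages = [Message('system', system_prompt).message(),   # system_prompt as defined in chat.py
#                 Message('user', "I'd like to know the cost cap for a power unit").message()]
#     reply = assistant.ask_assistant(messages)
#     print(reply['content'])
#
# Later turns pass further Message('user', ...) dicts the same way; once the model replies
# "searching for answers", the Redis lookup runs and its result is injected as an extra
# system message before the final answer is generated.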

@ -0,0 +1,7 @@
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
CHAT_MODEL = 'gpt-3.5-turbo'
TEXT_EMBEDDING_CHUNK_SIZE=300
VECTOR_FIELD_NAME='content_vector'
PREFIX = "sportsdoc"
INDEX_NAME = "f1-index"

@ -0,0 +1,82 @@
import pandas as pd
import numpy as np
import openai
from redis import Redis
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField, NumericField
from redis.commands.search.query import Query
from config import EMBEDDINGS_MODEL, PREFIX, VECTOR_FIELD_NAME
# Get a Redis connection
def get_redis_connection(host='localhost', port='6379', db=0):
    r = Redis(host=host, port=port, db=db, decode_responses=False)
    return r
# Create a Redis index to hold our data
def create_hnsw_index(redis_conn, vector_field_name, vector_dimensions=1536, distance_metric='COSINE'):
    redis_conn.ft().create_index([
        VectorField(vector_field_name, "HNSW", {"TYPE": "FLOAT32", "DIM": vector_dimensions, "DISTANCE_METRIC": distance_metric}),
        TextField("filename"),
        TextField("text_chunk"),
        NumericField("file_chunk_index")
    ])
# Create a Redis pipeline to load all the vectors and their metadata
def load_vectors(client: Redis, input_list, vector_field_name):
    p = client.pipeline(transaction=False)
    for text in input_list:
        # hash key
        key = f"{PREFIX}:{text['id']}"

        # hash values
        item_metadata = text['metadata']
        item_keywords_vector = np.array(text['vector'], dtype='float32').tobytes()
        item_metadata[vector_field_name] = item_keywords_vector

        # HSET
        p.hset(key, mapping=item_metadata)

    p.execute()
# Make query to Redis
def query_redis(redis_conn, query, index_name, top_k=2):

    ## Creates embedding vector from user query
    embedded_query = np.array(openai.Embedding.create(
        input=query,
        model=EMBEDDINGS_MODEL,
    )["data"][0]['embedding'], dtype=np.float32).tobytes()

    # Prepare the query
    q = Query(f'*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]').sort_by('vector_score').paging(0, top_k).return_fields('vector_score', 'filename', 'text_chunk', 'file_chunk_index').dialect(2)
    params_dict = {"vec_param": embedded_query}

    # Execute the query
    results = redis_conn.ft(index_name).search(q, query_params=params_dict)

    return results
# Get mapped documents from Redis results
def get_redis_results(redis_conn, query, index_name):

    # Get most relevant documents from Redis
    query_result = query_redis(redis_conn, query, index_name)

    # Extract info into a list
    query_result_list = []
    for i, result in enumerate(query_result.docs):
        result_order = i
        text = result.text_chunk
        score = result.vector_score
        query_result_list.append((result_order, text, score))

    # Display result as a DataFrame for ease of use
    result_df = pd.DataFrame(query_result_list)
    result_df.columns = ['id', 'result', 'certainty']
    return result_df
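
# Illustrative usage (not part of the original file), assuming a local Redis instance whose index
# has already been created and populated (see the notebook), and an OpenAI API key in the environment:
#
#     redis_client = get_redis_connection()
#     results = get_redis_results(redis_client, "power unit cost cap", "f1-index")
#     print(results['result'][0])   # highest-scoring text chunk for the query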

@ -0,0 +1,11 @@
numpy==1.24.2
openai==0.27.1
pandas==1.5.3
redis==4.5.1
requests==2.28.2
streamlit==1.20.0
streamlit_chat==0.0.2.2
termcolor==2.2.0
jupyter
ipykernel
textract

@ -0,0 +1,39 @@
import streamlit as st
import openai
from database import get_redis_connection, get_redis_results
from config import INDEX_NAME, COMPLETIONS_MODEL
# initialise Redis connection
client = get_redis_connection()
### SEARCH APP
st.set_page_config(
    page_title="Streamlit Search - Demo",
    page_icon=":robot:"
)
st.title('Formula 1 Search')
st.subheader("Search for any Formula 1 rule questions you have")
prompt = st.text_input("Enter your search here","", key="input")
if st.button('Submit', key='generationSubmit'):
    result_df = get_redis_results(client, prompt, INDEX_NAME)

    # Build a prompt to provide the original query, the result and ask to summarise for the user
    summary_prompt = '''Summarise this result in a bulleted list to answer the search query a customer has sent.
Search query: SEARCH_QUERY_HERE
Search result: SEARCH_RESULT_HERE
Summary:
'''
    summary_prepped = summary_prompt.replace('SEARCH_QUERY_HERE', prompt).replace('SEARCH_RESULT_HERE', result_df['result'][0])
    summary = openai.Completion.create(engine=COMPLETIONS_MODEL, prompt=summary_prepped, max_tokens=500)

    # Response provided by GPT-3
    st.write(summary['choices'][0]['text'])

    # Option to display raw table instead of summary from GPT-3
    # st.table(result_df)

@ -0,0 +1,116 @@
from typing import Iterator
from numpy import array, average
import openai
import pandas as pd
import numpy as np
from config import TEXT_EMBEDDING_CHUNK_SIZE, EMBEDDINGS_MODEL
from database import load_vectors
def get_col_average_from_list_of_lists(list_of_lists):
    """Return the average of each column in a list of lists."""
    if len(list_of_lists) == 1:
        return list_of_lists[0]
    else:
        list_of_lists_array = array(list_of_lists)
        average_embedding = average(list_of_lists_array, axis=0)
        return average_embedding.tolist()
# Create embeddings for a text using a tokenizer and an OpenAI engine
def create_embeddings_for_text(text, tokenizer):
    """Return a list of tuples (text_chunk, embedding) and an average embedding for a text."""
    token_chunks = list(chunks(text, TEXT_EMBEDDING_CHUNK_SIZE, tokenizer))
    text_chunks = [tokenizer.decode(chunk) for chunk in token_chunks]

    embeddings_response = get_embeddings(text_chunks, EMBEDDINGS_MODEL)
    embeddings = [embedding["embedding"] for embedding in embeddings_response]
    text_embeddings = list(zip(text_chunks, embeddings))

    average_embedding = get_col_average_from_list_of_lists(embeddings)

    return (text_embeddings, average_embedding)

def get_embeddings(text_array, engine):
    return openai.Engine(id=engine).embeddings(input=text_array)["data"]
# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
def chunks(text, n, tokenizer):
    """Yield successive n-sized chunks from text."""
    tokens = tokenizer.encode(text)
    i = 0
    while i < len(tokens):
        # Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
        j = min(i + int(1.5 * n), len(tokens))
        while j > i + int(0.5 * n):
            # Decode the tokens and check for full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith(".") or chunk.endswith("\n"):
                break
            j -= 1

        # If no end of sentence found, use n tokens as the chunk size
        if j == i + int(0.5 * n):
            j = min(i + n, len(tokens))

        yield tokens[i:j]
        i = j
def get_unique_id_for_file_chunk(filename, chunk_index):
    return str(filename + "-!" + str(chunk_index))
def handle_file_string(file, tokenizer, redis_conn, text_embedding_field, index_name):
    filename = file[0]
    file_body_string = file[1]

    # Clean up the file string by replacing newlines and double spaces and semi-colons
    clean_file_body_string = file_body_string.replace("  ", " ").replace("\n", "; ").replace(';', ' ')

    # Add the filename to the text to embed
    text_to_embed = "Filename is: {}; {}".format(
        filename, clean_file_body_string)

    # Create embeddings for the text
    try:
        text_embeddings, average_embedding = create_embeddings_for_text(
            text_to_embed, tokenizer)
        # print("[handle_file_string] Created embedding for {}".format(filename))
    except Exception as e:
        print("[handle_file_string] Error creating embedding: {}".format(e))
        # Skip this file if embedding failed, since there is nothing to load
        return

    # Get the vectors array of triples: file_chunk_id, embedding, metadata for each embedding
    # Metadata is a dict with keys: filename, file_chunk_index
    vectors = []
    for i, (text_chunk, embedding) in enumerate(text_embeddings):
        id = get_unique_id_for_file_chunk(filename, i)
        vectors.append({
            'id': id,
            "vector": embedding,
            'metadata': {
                "filename": filename,
                "text_chunk": text_chunk,
                "file_chunk_index": i
            }
        })

    try:
        load_vectors(redis_conn, vectors, text_embedding_field)
    except Exception as e:
        print(f'Ran into a problem uploading to Redis: {e}')
# Make a class to generate batches for insertion
class BatchGenerator:

    def __init__(self, batch_size: int = 10) -> None:
        self.batch_size = batch_size

    # Makes chunks out of an input DataFrame
    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        splits = self.splits_num(df.shape[0])
        if splits <= 1:
            yield df
        else:
            for chunk in np.array_split(df, splits):
                yield chunk

    # Determines how many chunks DataFrame contains
    def splits_num(self, elements: int) -> int:
        return round(elements / self.batch_size)

    __call__ = to_batches
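
# Illustrative usage (not part of the original file): batch a DataFrame of pre-embedded rows
# before loading them into Redis, e.g.
#
#     df_batcher = BatchGenerator(300)   # 300 rows per batch is an assumption, not a recommendation
#     for batch_df in df_batcher(df):
#         ...  # build the per-row dicts and call load_vectors on each batch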