Initial commit of simple app demonstrating chatbot starting point in notebook and Streamlit

pull/206/head
colin-openai 1 year ago
parent c3b920f123
commit 771f108f77

@ -0,0 +1,35 @@
# Powering your products with ChatGPT and your own data
The Chatbot Kickstarter is a starter repo to get you started building a basic Chatbot using the ChatGPT API and your own knowledge base. The flow you're taken through was originally presented with [these slides](https://drive.google.com/file/d/1dB-RQhZC_Q1iAsHkNNdkqtxxXqYODFYy/view?usp=share_link), which may be useful to refer to.
This repo contains one notebook and two basic Streamlit apps:
- `powering_your_products_with_chatgpt_and_your_data.ipynb`: A notebook containing a step by step process of tokenising, chunking and embedding your data in a vector database, and building simple Q&A and Chatbot functionality on top.
- `search.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.
- `chat.py`: A Streamlit app providing a simple Chatbot via a search bar to query your knowledge base.
To run either app, please follow the instructions in the respective README.md files in the subdirectories.
## How it works
The notebook is the best place to start, and is broadly laid out as follows:
- **Lay the foundations:**
- Set up the vector database to accept vectors and data
- Load the dataset, chunk the data up for embedding and store in the vector database
- **Make it a product:**
- Add a retrieval step where users provide queries and we return the most relevant entries
- Summarise search results with GPT-3
- Test out this basic Q&A app in Streamlit
- **Build your moat:**
- Create an Assistant class to manage context and interact with our bot
- Use the Chatbot to answer questions using semantic search context
- Test out this basic Chatbot app in Streamlit
Once you've run the notebook and tried the two Streamlit apps, you should be in a position to strip out any useful snippets and start your own Q&A or Chat application.
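For orientation, the core retrieval-and-summarise step that the notebook and `search.py` build up to looks roughly like the sketch below. This is a minimal illustration, assuming a running Redis instance already populated with embeddings and an OpenAI API key configured in your environment:

```python
import openai
from database import get_redis_connection, get_redis_results
from config import COMPLETIONS_MODEL, INDEX_NAME

# Connect to Redis and pull back the most relevant chunks for the query
redis_client = get_redis_connection()
question = "what is the cost cap for a power unit in 2023"
results = get_redis_results(redis_client, question, INDEX_NAME)

# Ask GPT-3 to answer the question using the top search result as context
answer_prompt = f"Answer this question: {question}\nUsing this content: {results['result'][0]}\nAnswer:"
answer = openai.Completion.create(engine=COMPLETIONS_MODEL, prompt=answer_prompt, max_tokens=500)
print(answer["choices"][0]["text"])
```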
## Limitations
- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your need.
- This is a simple starting point - if you hit issues deploying your use case, you may need to tune (non-exhaustive list; a short sketch of where these knobs live follows this list):
- The prompt and parameters for the model for it to answer accurately
- Your search to return more relevant results
- Your chunking/embedding approach to store the most relevant content effectively for retrieval
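
As a rough guide, the sketch below points at where those knobs live in this repo; the specific values are illustrative assumptions, not recommendations:

```python
import openai
from database import get_redis_connection, query_redis
from config import CHAT_MODEL, INDEX_NAME

redis_client = get_redis_connection()

# Search relevance: raise top_k to pull back more candidate chunks per query
results = query_redis(redis_client, "power unit cost cap", INDEX_NAME, top_k=5)

# Prompt and model parameters: adjust the messages, temperature and max_tokens until answers are accurate enough
completion = openai.ChatCompletion.create(
    model=CHAT_MODEL,
    messages=[{"role": "user", "content": "Summarise the 2023 power unit cost cap rules."}],
    temperature=0.1,   # lower = more deterministic answers
    max_tokens=500,
)

# Chunking/embedding: TEXT_EMBEDDING_CHUNK_SIZE in the config controls how much text each stored chunk holds
```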

@ -0,0 +1,83 @@
import streamlit as st
from streamlit_chat import message
from database import get_redis_connection
from chatbot import RetrievalAssistant, Message
# Initialise database
## Initialise Redis connection
redis_client = get_redis_connection()
# Set instruction
# System prompt requiring Question and Year to be extracted from the user
system_prompt = '''
You are a helpful Formula 1 knowledge base assistant. You need to capture a Question and Year from each customer.
The Question is their query on Formula 1, and the Year is the year of the applicable Formula 1 season.
Think about this step by step:
- The user will ask a Question
- You will ask them for the Year if their question didn't include a Year
- Once you have the Year, say "searching for answers".
Example:
User: I'd like to know the cost cap for a power unit
Assistant: Certainly, what year would you like this for?
User: 2023 please.
Assistant: Searching for answers.
'''
### CHATBOT APP
st.set_page_config(
    page_title="Streamlit Chat - Demo",
    page_icon=":robot:"
)
st.title('Formula 1 Chatbot')
st.subheader("Help us help you learn about Formula 1")
if 'generated' not in st.session_state:
    st.session_state['generated'] = []

if 'past' not in st.session_state:
    st.session_state['past'] = []

def query(question):
    response = st.session_state['chat'].ask_assistant(question)
    return response
prompt = st.text_input("What do you want to know: ","", key="input")
if st.button('Submit', key='generationSubmit'):

    # Initialization
    if 'chat' not in st.session_state:
        st.session_state['chat'] = RetrievalAssistant()
        messages = []
        system_message = Message('system', system_prompt)
        messages.append(system_message.message())
    else:
        messages = []

    user_message = Message('user', prompt)
    messages.append(user_message.message())

    response = query(messages)

    # Debugging step to print the whole response
    # st.write(response)

    st.session_state.past.append(prompt)
    st.session_state.generated.append(response['content'])

if st.session_state['generated']:

    for i in range(len(st.session_state['generated']) - 1, -1, -1):
        message(st.session_state["generated"][i], key=str(i))
        message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')
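
# Usage note (illustrative, not part of the original file): with the requirements installed and a
# local Redis instance holding the populated index, this app can be started with
# `streamlit run chat.py`; the search-only variant runs with `streamlit run search.py`.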

@ -0,0 +1,84 @@
import openai
from termcolor import colored
import streamlit as st
from database import get_redis_connection,get_redis_results
from config import CHAT_MODEL,COMPLETIONS_MODEL, INDEX_NAME
redis_client = get_redis_connection()
# A basic class to create a message as a dict for chat
class Message:

    def __init__(self, role, content):
        self.role = role
        self.content = content

    def message(self):
        return {"role": self.role, "content": self.content}
# New Assistant class to add a vector database call to its responses
class RetrievalAssistant:

    def __init__(self):
        self.conversation_history = []

    def _get_assistant_response(self, prompt):
        try:
            completion = openai.ChatCompletion.create(
                model=CHAT_MODEL,
                messages=prompt,
                temperature=0.1
            )

            response_message = Message(
                completion['choices'][0]['message']['role'],
                completion['choices'][0]['message']['content']
            )
            return response_message.message()

        except Exception as e:
            return f'Request failed with exception {e}'

    # The function to retrieve Redis search results
    def _get_search_results(self, prompt):
        latest_question = prompt
        search_content = get_redis_results(redis_client, latest_question, INDEX_NAME)['result'][0]
        return search_content

    def ask_assistant(self, next_user_prompt):
        self.conversation_history.extend(next_user_prompt)
        assistant_response = self._get_assistant_response(self.conversation_history)
        # Answer normally unless the trigger phrase "searching for answers" appears in the response
        if 'searching for answers' in assistant_response['content'].lower():
            question_extract = openai.Completion.create(
                model=COMPLETIONS_MODEL,
                prompt=f"Extract the user's latest question and the year for that question from this conversation: {self.conversation_history}. Extract it as a sentence stating the Question and Year"
            )
            search_result = self._get_search_results(question_extract['choices'][0]['text'])

            # We insert an extra system prompt here to give fresh context to the Chatbot on how to use the Redis results
            # In this instance we add it to the conversation history, but in production it may be better to hide
            self.conversation_history.insert(
                -1,
                {
                    "role": 'system',
                    "content": f"Answer the user's question using this content: {search_result}. If you cannot answer the question, say 'Sorry, I don't know the answer to this one'"
                }
            )

            assistant_response = self._get_assistant_response(self.conversation_history)
            self.conversation_history.append(assistant_response)
            return assistant_response
        else:
            self.conversation_history.append(assistant_response)
            return assistant_response
    def pretty_print_conversation_history(self, colorize_assistant_replies=True):
        for entry in self.conversation_history:
            if entry['role'] == 'system':
                pass
            else:
                prefix = entry['role']
                content = entry['content']
                output = colored(prefix + ':\n' + content, 'green') if colorize_assistant_replies and entry['role'] == 'assistant' else prefix + ':\n' + content
                print(output)
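
# Illustrative usage sketch (not part of the original file), mirroring how chat.py drives this class:
#
#     assistant = RetrievalAssistant()
#     messages = [Message('system', system_prompt).message(),   # system_prompt as defined in chat.py
#                 Message('user', "I'd like to know the cost cap for a power unit").message()]
#     reply = assistant.ask_assistant(messages)
#     print(reply['content'])
#
# Later turns pass further Message('user', ...) dicts the same way; once the model replies
# "searching for answers", the Redis lookup runs and its result is injected as an extra
# system message before the final answer is generated.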

@ -0,0 +1,7 @@
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
CHAT_MODEL = 'gpt-3.5-turbo'
TEXT_EMBEDDING_CHUNK_SIZE=300
VECTOR_FIELD_NAME='content_vector'
PREFIX = "sportsdoc"
INDEX_NAME = "f1-index"

@ -0,0 +1,82 @@
import pandas as pd
import numpy as np
import openai
from redis import Redis
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField, NumericField
from redis.commands.search.query import Query
from config import EMBEDDINGS_MODEL, PREFIX, VECTOR_FIELD_NAME
# Get a Redis connection
def get_redis_connection(host='localhost', port='6379', db=0):
    r = Redis(host=host, port=port, db=db, decode_responses=False)
    return r
# Create a Redis index to hold our data
def create_hnsw_index(redis_conn, vector_field_name, vector_dimensions=1536, distance_metric='COSINE'):
    redis_conn.ft().create_index([
        VectorField(vector_field_name, "HNSW", {"TYPE": "FLOAT32", "DIM": vector_dimensions, "DISTANCE_METRIC": distance_metric}),
        TextField("filename"),
        TextField("text_chunk"),
        NumericField("file_chunk_index")
    ])
# Create a Redis pipeline to load all the vectors and their metadata
def load_vectors(client: Redis, input_list, vector_field_name):
    p = client.pipeline(transaction=False)
    for text in input_list:
        # hash key
        key = f"{PREFIX}:{text['id']}"

        # hash values
        item_metadata = text['metadata']
        item_keywords_vector = np.array(text['vector'], dtype='float32').tobytes()
        item_metadata[vector_field_name] = item_keywords_vector

        # HSET
        p.hset(key, mapping=item_metadata)

    p.execute()
# Make query to Redis
def query_redis(redis_conn, query, index_name, top_k=2):

    ## Creates embedding vector from user query
    embedded_query = np.array(openai.Embedding.create(
        input=query,
        model=EMBEDDINGS_MODEL,
    )["data"][0]['embedding'], dtype=np.float32).tobytes()

    # Prepare the query
    q = Query(f'*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]').sort_by('vector_score').paging(0, top_k).return_fields('vector_score', 'filename', 'text_chunk', 'file_chunk_index').dialect(2)
    params_dict = {"vec_param": embedded_query}

    # Execute the query
    results = redis_conn.ft(index_name).search(q, query_params=params_dict)

    return results
# Get mapped documents from Redis results
def get_redis_results(redis_conn, query, index_name):

    # Get most relevant documents from Redis
    query_result = query_redis(redis_conn, query, index_name)

    # Extract info into a list
    query_result_list = []
    for i, result in enumerate(query_result.docs):
        result_order = i
        text = result.text_chunk
        score = result.vector_score
        query_result_list.append((result_order, text, score))

    # Display result as a DataFrame for ease of use
    result_df = pd.DataFrame(query_result_list)
    result_df.columns = ['id', 'result', 'certainty']
    return result_df
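
# Illustrative usage (not part of the original file), assuming a local Redis instance whose index
# has already been created and populated (see the notebook), and an OpenAI API key in the environment:
#
#     redis_client = get_redis_connection()
#     results = get_redis_results(redis_client, "power unit cost cap", "f1-index")
#     print(results['result'][0])   # highest-scoring text chunk for the query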

@ -0,0 +1,11 @@
numpy==1.24.2
openai==0.27.1
pandas==1.5.3
redis==4.5.1
requests==2.28.2
streamlit==1.20.0
streamlit_chat==0.0.2.2
termcolor==2.2.0
jupyter
ipykernel
textract

@ -0,0 +1,39 @@
import streamlit as st
import openai
from database import get_redis_connection, get_redis_results
from config import INDEX_NAME, COMPLETIONS_MODEL
# initialise Redis connection
client = get_redis_connection()
### SEARCH APP
st.set_page_config(
    page_title="Streamlit Search - Demo",
    page_icon=":robot:"
)
st.title('Formula 1 Search')
st.subheader("Search for any Formula 1 rule questions you have")
prompt = st.text_input("Enter your search here","", key="input")
if st.button('Submit', key='generationSubmit'):
    result_df = get_redis_results(client, prompt, INDEX_NAME)

    # Build a prompt to provide the original query, the result and ask to summarise for the user
    summary_prompt = '''Summarise this result in a bulleted list to answer the search query a customer has sent.
Search query: SEARCH_QUERY_HERE
Search result: SEARCH_RESULT_HERE
Summary:
'''
    summary_prepped = summary_prompt.replace('SEARCH_QUERY_HERE', prompt).replace('SEARCH_RESULT_HERE', result_df['result'][0])
    summary = openai.Completion.create(engine=COMPLETIONS_MODEL, prompt=summary_prepped, max_tokens=500)

    # Response provided by GPT-3
    st.write(summary['choices'][0]['text'])

    # Option to display raw table instead of summary from GPT-3
    # st.table(result_df)

@ -0,0 +1,116 @@
from typing import Iterator
from numpy import array, average
import openai
import pandas as pd
import numpy as np
from config import TEXT_EMBEDDING_CHUNK_SIZE, EMBEDDINGS_MODEL
from database import load_vectors
def get_col_average_from_list_of_lists(list_of_lists):
    """Return the average of each column in a list of lists."""
    if len(list_of_lists) == 1:
        return list_of_lists[0]
    else:
        list_of_lists_array = array(list_of_lists)
        average_embedding = average(list_of_lists_array, axis=0)
        return average_embedding.tolist()
# Create embeddings for a text using a tokenizer and an OpenAI engine
def create_embeddings_for_text(text, tokenizer):
    """Return a list of tuples (text_chunk, embedding) and an average embedding for a text."""
    token_chunks = list(chunks(text, TEXT_EMBEDDING_CHUNK_SIZE, tokenizer))
    text_chunks = [tokenizer.decode(chunk) for chunk in token_chunks]

    embeddings_response = get_embeddings(text_chunks, EMBEDDINGS_MODEL)
    embeddings = [embedding["embedding"] for embedding in embeddings_response]
    text_embeddings = list(zip(text_chunks, embeddings))

    average_embedding = get_col_average_from_list_of_lists(embeddings)

    return (text_embeddings, average_embedding)

def get_embeddings(text_array, engine):
    return openai.Engine(id=engine).embeddings(input=text_array)["data"]
# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
def chunks(text, n, tokenizer):
    """Yield successive n-sized chunks from text."""
    tokens = tokenizer.encode(text)
    i = 0
    while i < len(tokens):
        # Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
        j = min(i + int(1.5 * n), len(tokens))
        while j > i + int(0.5 * n):
            # Decode the tokens and check for full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith(".") or chunk.endswith("\n"):
                break
            j -= 1

        # If no end of sentence found, use n tokens as the chunk size
        if j == i + int(0.5 * n):
            j = min(i + n, len(tokens))

        yield tokens[i:j]
        i = j
def get_unique_id_for_file_chunk(filename, chunk_index):
    return str(filename + "-!" + str(chunk_index))
def handle_file_string(file, tokenizer, redis_conn, text_embedding_field, index_name):
    filename = file[0]
    file_body_string = file[1]

    # Clean up the file string by replacing newlines and double spaces and semi-colons
    clean_file_body_string = file_body_string.replace("  ", " ").replace("\n", "; ").replace(';', ' ')

    # Add the filename to the text to embed
    text_to_embed = "Filename is: {}; {}".format(
        filename, clean_file_body_string)

    # Create embeddings for the text
    try:
        text_embeddings, average_embedding = create_embeddings_for_text(
            text_to_embed, tokenizer)
        # print("[handle_file_string] Created embedding for {}".format(filename))
    except Exception as e:
        print("[handle_file_string] Error creating embedding: {}".format(e))
        # Skip this file if embedding failed, since there is nothing to load
        return

    # Get the vectors array of triples: file_chunk_id, embedding, metadata for each embedding
    # Metadata is a dict with keys: filename, file_chunk_index
    vectors = []
    for i, (text_chunk, embedding) in enumerate(text_embeddings):
        id = get_unique_id_for_file_chunk(filename, i)
        vectors.append({
            'id': id,
            "vector": embedding,
            'metadata': {
                "filename": filename,
                "text_chunk": text_chunk,
                "file_chunk_index": i
            }
        })

    try:
        load_vectors(redis_conn, vectors, text_embedding_field)
    except Exception as e:
        print(f'Ran into a problem uploading to Redis: {e}')
# Make a class to generate batches for insertion
class BatchGenerator:

    def __init__(self, batch_size: int = 10) -> None:
        self.batch_size = batch_size

    # Makes chunks out of an input DataFrame
    def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
        splits = self.splits_num(df.shape[0])
        if splits <= 1:
            yield df
        else:
            for chunk in np.array_split(df, splits):
                yield chunk

    # Determines how many chunks DataFrame contains
    def splits_num(self, elements: int) -> int:
        return round(elements / self.batch_size)

    __call__ = to_batches
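
# Illustrative usage (not part of the original file): batch a DataFrame of pre-embedded rows
# before loading them into Redis, e.g.
#
#     df_batcher = BatchGenerator(300)   # 300 rows per batch is an assumption, not a recommendation
#     for batch_df in df_batcher(df):
#         ...  # build the per-row dicts and call load_vectors on each batch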