Remove apps directory (#776)

pull/777/head
Simón Fishman 7 months ago committed by GitHub
parent 9e09df530d
commit b8a2ddd809

@ -1,35 +0,0 @@
# Powering your products with ChatGPT and your own data
The Chatbot Kickstarter is a starter repo to get you used to building a basic Chatbot using the ChatGPT API and your own knowledge base. The flow you're taken through was originally presented with [these slides](https://drive.google.com/file/d/1dB-RQhZC_Q1iAsHkNNdkqtxxXqYODFYy/view?usp=share_link), which may come in useful to refer to.
This repo contains one notebook and two basic Streamlit apps:
- `powering_your_products_with_chatgpt_and_your_data.ipynb`: A notebook containing a step-by-step process of tokenising, chunking and embedding your data in a vector database, and building simple Q&A and Chatbot functionality on top.
- `search.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.
- `chat.py`: A Streamlit app providing a simple Chatbot via a search bar to query your knowledge base.
To run either version of the app, please follow the instructions in the respective README.md files in the subdirectories.
## How it works
The notebook is the best place to start, and is broadly laid out as follows:
- **Lay the foundations:**
- Set up the vector database to accept vectors and data
- Load the dataset, chunk the data up for embedding and store in the vector database
- **Make it a product:**
- Add a retrieval step where users provide queries and we return the most relevant entries
- Summarise search results with GPT-3
- Test out this basic Q&A app in Streamlit
- **Build your moat:**
- Create an Assistant class to manage context and interact with our bot
- Use the Chatbot to answer questions using semantic search context
- Test out this basic Chatbot app in Streamlit
Once you've run the notebook and tried the two Streamlit apps, you should be in a position to strip out any useful snippets and start your own Q&A or Chat application.
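For orientation, the core retrieve-and-summarise loop that the notebook and `search.py` build up is roughly the following sketch (it assumes the `get_redis_connection`/`get_redis_results` helpers and the constants from `config.py` included in this repo; the prompt wording here is illustrative):

```python
import openai

from config import COMPLETIONS_MODEL, INDEX_NAME
from database import get_redis_connection, get_redis_results

redis_client = get_redis_connection()

def answer_question(query: str) -> str:
    # Retrieve the most relevant chunk for the user's query from the vector database
    results = get_redis_results(redis_client, query, INDEX_NAME)
    top_result = results["result"][0]

    # Ask the completions model to answer using only the retrieved content
    prompt = (
        "Answer the question using only the content below.\n\n"
        f"Content: {top_result}\n\nQuestion: {query}\nAnswer:"
    )
    response = openai.Completion.create(
        model=COMPLETIONS_MODEL, prompt=prompt, max_tokens=300
    )
    return response["choices"][0]["text"].strip()
```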
## Limitations
- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your needs.
- This is a simple starting point - if you hit issues deploying your use case you may need to tune (non-exhaustive list):
- The prompt and parameters for the model for it to answer accurately
- Your search to return more relevant results
- Your chunking/embedding approach to store the most relevant content effectively for retrieval

@ -1,83 +0,0 @@
import streamlit as st
from streamlit_chat import message
from database import get_redis_connection
from chatbot import RetrievalAssistant, Message
# Initialise database
## Initialise Redis connection
redis_client = get_redis_connection()
# Set instruction
# System prompt requiring Question and Year to be extracted from the user
system_prompt = '''
You are a helpful Formula 1 knowledge base assistant. You need to capture a Question and Year from each customer.
The Question is their query on Formula 1, and the Year is the year of the applicable Formula 1 season.
Think about this step by step:
- The user will ask a Question
- You will ask them for the Year if their question didn't include a Year
- Once you have the Year, say "searching for answers".
Example:
User: I'd like to know the cost cap for a power unit
Assistant: Certainly, what year would you like this for?
User: 2023 please.
Assistant: Searching for answers.
'''
### CHATBOT APP
st.set_page_config(
page_title="Streamlit Chat - Demo",
page_icon=":robot:"
)
st.title('Formula 1 Chatbot')
st.subheader("Help us help you learn about Formula 1")
if 'generated' not in st.session_state:
st.session_state['generated'] = []
if 'past' not in st.session_state:
st.session_state['past'] = []
def query(question):
response = st.session_state['chat'].ask_assistant(question)
return response
prompt = st.text_input("What do you want to know: ", key="input")
if st.button('Submit', key='generationSubmit'):
# Initialization
if 'chat' not in st.session_state:
st.session_state['chat'] = RetrievalAssistant()
messages = []
system_message = Message('system',system_prompt)
messages.append(system_message.message())
else:
messages = []
user_message = Message('user',prompt)
messages.append(user_message.message())
response = query(messages)
# Debugging step to print the whole response
#st.write(response)
st.session_state.past.append(prompt)
st.session_state.generated.append(response['content'])
if st.session_state['generated']:
for i in range(len(st.session_state['generated'])-1, -1, -1):
message(st.session_state["generated"][i], key=str(i))
message(st.session_state['past'][i], is_user=True, key=str(i) + '_user')

@ -1,111 +0,0 @@
import openai
from termcolor import colored
import streamlit as st
from database import get_redis_connection, get_redis_results
from config import CHAT_MODEL, COMPLETIONS_MODEL, INDEX_NAME
redis_client = get_redis_connection()
# A basic class to create a message as a dict for chat
class Message:
def __init__(self, role,content):
self.role = role
self.content = content
def message(self):
return {
"role": self.role,
"content": self.content
}
# New Assistant class to add a vector database call to its responses
class RetrievalAssistant:
def __init__(self):
self.conversation_history = []
def _get_assistant_response(self, prompt):
try:
completion = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=prompt,
temperature=0.1
)
response_message = Message(
completion['choices'][0]['message']['role'],
completion['choices'][0]['message']['content']
)
return response_message.message()
except Exception as e:
return f'Request failed with exception {e}'
# The function to retrieve Redis search results
def _get_search_results(self,prompt):
latest_question = prompt
search_content = get_redis_results(
redis_client,latest_question,
INDEX_NAME
)['result'][0]
return search_content
def ask_assistant(self, next_user_prompt):
self.conversation_history.extend(next_user_prompt)
assistant_response = self._get_assistant_response(self.conversation_history)
# Answer normally unless the trigger phrase "searching for answers" appears in the response
if 'searching for answers' in assistant_response['content'].lower():
question_extract = openai.Completion.create(
model = COMPLETIONS_MODEL,
prompt=f'''
Extract the user's latest question and the year for that question from this
conversation: {self.conversation_history}. Extract it as a sentence stating the Question and Year
'''
)
search_result = self._get_search_results(question_extract['choices'][0]['text'])
# We insert an extra system prompt here to give fresh context to the Chatbot on how to use the Redis results
# In this instance we add it to the conversation history, but in production it may be better to hide
self.conversation_history.insert(
-1,{
"role": 'system',
"content": f'''
Answer the user's question using this content: {search_result}.
If you cannot answer the question, say 'Sorry, I don't know the answer to this one'
'''
}
)
assistant_response = self._get_assistant_response(
self.conversation_history
)
self.conversation_history.append(assistant_response)
return assistant_response
else:
self.conversation_history.append(assistant_response)
return assistant_response
def pretty_print_conversation_history(
self,
colorize_assistant_replies=True):
for entry in self.conversation_history:
if entry['role']=='system':
pass
else:
prefix = entry['role']
content = entry['content']
if colorize_assistant_replies and entry['role'] == 'assistant':
output = colored(f"{prefix}:\n{content}", "green")
else:
output = colored(f"{prefix}:\n{content}")
print(output)

@ -1,7 +0,0 @@
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
CHAT_MODEL = 'gpt-3.5-turbo'
TEXT_EMBEDDING_CHUNK_SIZE=300
VECTOR_FIELD_NAME='content_vector'
PREFIX = "sportsdoc"
INDEX_NAME = "f1-index"

@ -1,82 +0,0 @@
import pandas as pd
import numpy as np
import openai
from redis import Redis
from redis.commands.search.field import VectorField
from redis.commands.search.field import TextField, NumericField
from redis.commands.search.query import Query
from config import EMBEDDINGS_MODEL, PREFIX, VECTOR_FIELD_NAME
# Get a Redis connection
def get_redis_connection(host='localhost',port='6379',db=0):
r = Redis(host=host, port=port, db=db,decode_responses=False)
return r
# Create a Redis index to hold our data
def create_hnsw_index (redis_conn,vector_field_name,vector_dimensions=1536, distance_metric='COSINE'):
redis_conn.ft().create_index([
VectorField(vector_field_name, "HNSW", {"TYPE": "FLOAT32", "DIM": vector_dimensions, "DISTANCE_METRIC": distance_metric}),
TextField("filename"),
TextField("text_chunk"),
NumericField("file_chunk_index")
])
# Create a Redis pipeline to load all the vectors and their metadata
def load_vectors(client:Redis, input_list, vector_field_name):
p = client.pipeline(transaction=False)
for text in input_list:
#hash key
key=f"{PREFIX}:{text['id']}"
#hash values
item_metadata = text['metadata']
#
item_keywords_vector = np.array(text['vector'],dtype= 'float32').tobytes()
item_metadata[vector_field_name]=item_keywords_vector
# HSET
p.hset(key,mapping=item_metadata)
p.execute()
# Make query to Redis
def query_redis(redis_conn,query,index_name, top_k=2):
## Creates embedding vector from user query
embedded_query = np.array(openai.Embedding.create(
input=query,
model=EMBEDDINGS_MODEL,
)["data"][0]['embedding'], dtype=np.float32).tobytes()
#prepare the query
q = Query(f'*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]').sort_by('vector_score').paging(0,top_k).return_fields('vector_score','filename','text_chunk','text_chunk_index').dialect(2)
params_dict = {"vec_param": embedded_query}
#Execute the query
results = redis_conn.ft(index_name).search(q, query_params = params_dict)
return results
# Get mapped documents from Redis results
def get_redis_results(redis_conn,query,index_name):
# Get most relevant documents from Redis
query_result = query_redis(redis_conn,query,index_name)
# Extract info into a list
query_result_list = []
for i, result in enumerate(query_result.docs):
result_order = i
text = result.text_chunk
score = result.vector_score
query_result_list.append((result_order,text,score))
# Display result as a DataFrame for ease of use
result_df = pd.DataFrame(query_result_list)
result_df.columns = ['id','result','certainty']
return result_df

@ -1,12 +0,0 @@
numpy==1.24.2
openai==0.27.1
pandas==1.5.3
redis==4.5.4
requests==2.31.0
streamlit==1.20.0
streamlit_chat==0.0.2.2
termcolor==2.2.0
jupyter
ipykernel
textract
tiktoken

@ -1,39 +0,0 @@
import streamlit as st
import openai
from database import get_redis_connection, get_redis_results
from config import INDEX_NAME, COMPLETIONS_MODEL
# initialise Redis connection
client = get_redis_connection()
### SEARCH APP
st.set_page_config(
page_title="Streamlit Search - Demo",
page_icon=":robot:"
)
st.title('Formula 1 Search')
st.subheader("Search for any Formula 1 rule questions you have")
prompt = st.text_input("Enter your search here","", key="input")
if st.button('Submit', key='generationSubmit'):
result_df = get_redis_results(client,prompt,INDEX_NAME)
# Build a prompt to provide the original query, the result and ask to summarise for the user
summary_prompt = '''Summarise this result in a bulleted list to answer the search query a customer has sent.
Search query: SEARCH_QUERY_HERE
Search result: SEARCH_RESULT_HERE
Summary:
'''
summary_prepped = summary_prompt.replace('SEARCH_QUERY_HERE',prompt).replace('SEARCH_RESULT_HERE',result_df['result'][0])
summary = openai.Completion.create(engine=COMPLETIONS_MODEL,prompt=summary_prepped,max_tokens=500)
# Response provided by GPT-3
st.write(summary['choices'][0]['text'])
# Option to display raw table instead of summary from GPT-3
#st.table(result_df)

@ -1,131 +0,0 @@
from typing import Iterator
from numpy import array, average
import openai
import pandas as pd
import numpy as np
from config import TEXT_EMBEDDING_CHUNK_SIZE, EMBEDDINGS_MODEL
from database import load_vectors
def get_col_average_from_list_of_lists(list_of_lists):
"""Return the average of each column in a list of lists."""
if len(list_of_lists) == 1:
return list_of_lists[0]
else:
list_of_lists_array = array(list_of_lists)
average_embedding = average(list_of_lists_array, axis=0)
return average_embedding.tolist()
# Create embeddings for a text using a tokenizer and an OpenAI engine
def create_embeddings_for_text(text, tokenizer):
"""Return a list of tuples (text_chunk, embedding) and an average embedding for a text."""
token_chunks = list(chunks(text, TEXT_EMBEDDING_CHUNK_SIZE, tokenizer))
text_chunks = [tokenizer.decode(chunk) for chunk in token_chunks]
embeddings_response = get_embeddings(text_chunks, EMBEDDINGS_MODEL)
embeddings = [embedding["embedding"] for embedding in embeddings_response]
text_embeddings = list(zip(text_chunks, embeddings))
average_embedding = get_col_average_from_list_of_lists(embeddings)
return (text_embeddings, average_embedding)
def get_embeddings(text_array, engine):
return openai.Engine(id=engine).embeddings(input=text_array)["data"]
# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
def chunks(text, n, tokenizer):
"""Yield successive n-sized chunks from text."""
tokens = tokenizer.encode(text)
i = 0
while i < len(tokens):
# Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
j = min(i + int(1.5 * n), len(tokens))
while j > i + int(0.5 * n):
# Decode the tokens and check for full stop or newline
chunk = tokenizer.decode(tokens[i:j])
if chunk.endswith(".") or chunk.endswith("\n"):
break
j -= 1
# If no end of sentence found, use n tokens as the chunk size
if j == i + int(0.5 * n):
j = min(i + n, len(tokens))
yield tokens[i:j]
i = j
def get_unique_id_for_file_chunk(filename, chunk_index):
return str(filename+"-!"+str(chunk_index))
def handle_file_string(file, tokenizer, redis_conn, text_embedding_field, index_name):
"""
Handle a file string by cleaning it up, creating embeddings, and uploading them to Redis.
Args:
file (tuple): A tuple containing the filename and file body string.
tokenizer: The tokenizer object to use for encoding and decoding text.
redis_conn: The Redis connection object.
text_embedding_field (str): The field in Redis where the text embeddings will be stored.
index_name: The name of the index or identifier for the embeddings.
Returns:
None
Raises:
Exception: If there is an error creating embeddings or uploading to Redis.
"""
filename = file[0]
file_body_string = file[1]
# Clean up the file string by replacing newlines, double spaces, and semi-colons
clean_file_body_string = file_body_string.replace("  ", " ").replace("\n", "; ").replace(';', ' ')
# Add the filename to the text to embed
text_to_embed = "Filename is: {}; {}".format(filename, clean_file_body_string)
try:
# Create embeddings for the text
text_embeddings, average_embedding = create_embeddings_for_text(text_to_embed, tokenizer)
# print("[handle_file_string] Created embedding for {}".format(filename))
except Exception as e:
print("[handle_file_string] Error creating embedding: {}".format(e))
# Get the vectors array of triples: file_chunk_id, embedding, metadata for each embedding
# Metadata is a dict with keys: filename, file_chunk_index
vectors = []
for i, (text_chunk, embedding) in enumerate(text_embeddings):
id = get_unique_id_for_file_chunk(filename, i)
vectors.append({'id': id, "vector": embedding, 'metadata': {"filename": filename,
"text_chunk": text_chunk,
"file_chunk_index": i}})
try:
# Load vectors into Redis
load_vectors(redis_conn, vectors, text_embedding_field)
except Exception as e:
print(f'Ran into a problem uploading to Redis: {e}')
# Make a class to generate batches for insertion
class BatchGenerator:
def __init__(self, batch_size: int = 10) -> None:
self.batch_size = batch_size
# Makes chunks out of an input DataFrame
def to_batches(self, df: pd.DataFrame) -> Iterator[pd.DataFrame]:
splits = self.splits_num(df.shape[0])
if splits <= 1:
yield df
else:
for chunk in np.array_split(df, splits):
yield chunk
# Determines how many chunks DataFrame contains
def splits_num(self, elements: int) -> int:
return round(elements / self.batch_size)
__call__ = to_batches

@ -1,42 +0,0 @@
# Embeddings Playground
[`embeddings_playground.py`](embeddings_playground.py) is a single-page streamlit app for experimenting with OpenAI embeddings.
## Installation
Before running, install required dependencies with:
`pip install -r apps/embeddings-playground/requirements.txt`
(You may need to change the path to match your local path.)
Verify installation of streamlit with `streamlit hello`.
## Usage
Run the script with:
`streamlit run apps/embeddings-playground/embeddings_playground.py`
(Again, you may need to change the path to match your local path.)
In the app, first select your choice of:
- distance metric (we recommend cosine)
- embedding model (we recommend `text-embedding-ada-002` for most use cases, as of May 2023)
Then, enter a variable number of strings to compare. Click `rank` to see:
- the ranked list of strings, sorted by distance from the first string
- a heatmap showing the distance between each pair of strings
## Example
Here's an example distance matrix for 8 example strings related to `The sky is blue`:
![example distance matrix](example_distance_matrix.png)
From these distance pairs, you can see:
- embeddings measure topical similarity more than logical similarity (e.g., `The sky is blue` is very close to `The sky is not blue`)
- punctuation affects embeddings (e.g., `"THE. SKY. IS. BLUE!"` is only third closest to `The sky is blue`)
- within-language pairs are stronger than across-language pairs (e.g., `El cielo es azul` is closer to `El cielo es rojo` than to `The sky is blue`)
Experiment with your own strings to see what you can learn.
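If you want to reproduce a single cell of the distance matrix outside the app, the calculation is just one embeddings call per string plus a SciPy distance function, as in this small sketch (same `openai` and `scipy` dependencies as the app; the model choice is illustrative):

```python
import openai
from scipy import spatial

def pairwise_distance(a: str, b: str, model: str = "text-embedding-ada-002") -> float:
    # Embed both strings with the same embedding model
    emb_a = openai.Embedding.create(input=a, model=model)["data"][0]["embedding"]
    emb_b = openai.Embedding.create(input=b, model=model)["data"][0]["embedding"]
    # Cosine distance: 0 means identical direction, larger means less related
    return spatial.distance.cosine(emb_a, emb_b)

print(pairwise_distance("The sky is blue", "El cielo es azul"))
```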

@ -1,178 +0,0 @@
"""
EMBEDDINGS PLAYGROUND
This is a single-page streamlit app for experimenting with OpenAI embeddings.
Before running, install required dependencies with:
`pip install -r apps/embeddings-playground/requirements.txt`
You may need to change the path to match your local path.
Verify installation of streamlit with `streamlit hello`.
Run this script with:
`streamlit run apps/embeddings-playground/embeddings_playground.py`
Again, you may need to change the path to match your local path.
"""
# IMPORTS
import altair as alt
import openai
import os
import pandas as pd
from scipy import spatial
import streamlit as st
from tenacity import (
retry,
stop_after_attempt,
wait_random_exponential,
)
# FUNCTIONS
# get embeddings
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
@st.cache_data
def embedding_from_string(input: str, model: str) -> list:
response = openai.Embedding.create(input=input, model=model)
embedding = response["data"][0]["embedding"]
return embedding
# plot distance matrix
def plot_distance_matrix(strings: list, engine: str, distance: str):
# create dataframe of embedding distances
df = pd.DataFrame({"string": strings, "index": range(len(strings))})
df["embedding"] = df["string"].apply(lambda string: embedding_from_string(string, engine))
df["string"] = df.apply(lambda row: f"({row['index'] + 1}) {row['string']}", axis=1)
df["dummy_key"] = 0
df = pd.merge(df, df, on="dummy_key", suffixes=("_1", "_2")).drop("dummy_key", axis=1)
df = df[df["string_1"] != df["string_2"]] # filter out diagonal (always 0)
df["distance"] = df.apply(
lambda row: distance_metrics[distance](row["embedding_1"], row["embedding_2"]),
axis=1,
)
df["label"] = df["distance"].apply(lambda d: f"{d:.2f}")
# set chart params
text_size = 32
label_size = 16
pixels_per_string = 80 # aka row height & column width (perpendicular to text)
max_label_width = 256 # in pixels, not characters, I think?
chart_width = (
50
+ min(max_label_width, max(df["string_1"].apply(len) * label_size/2))
+ len(strings) * pixels_per_string
)
# extract chart parameters from data
color_min = df["distance"].min()
color_max = 1.5 * df["distance"].max()
x_order = df["string_1"].values
ranked = False
if ranked:
ranked_df = df[(df["string_1"] == f"(1) {strings[0]}")].sort_values(by="distance")
y_order = ranked_df["string_2"].values
else:
y_order = x_order
# create chart
boxes = (
alt.Chart(df, title=f"{engine}")
.mark_rect()
.encode(
x=alt.X("string_1", title=None, sort=x_order),
y=alt.Y("string_2", title=None, sort=y_order),
color=alt.Color("distance:Q", title=f"{distance} distance", scale=alt.Scale(domain=[color_min,color_max], scheme="darkblue", reverse=True)),
)
)
labels = (
boxes.mark_text(align="center", baseline="middle", fontSize=text_size)
.encode(text="label")
.configure_axis(labelLimit=max_label_width, labelFontSize=label_size)
.properties(width=chart_width, height=chart_width)
)
st.altair_chart(labels) # note: layered plots are not supported in streamlit :(
# PAGE
st.title("OpenAI Embeddings Playground")
# get API key
try:
openai.api_key = os.getenv("OPENAI_API_KEY")
st.write(f"API key successfully retrieved: {openai.api_key[:3]}...{openai.api_key[-4:]}")
except:
st.header("Enter API Key")
openai.api_key = st.text_input("API key")
# select distance metric
st.header("Select distance metric")
distance_metrics = {
"cosine": spatial.distance.cosine,
"L1 (cityblock)": spatial.distance.cityblock,
"L2 (euclidean)": spatial.distance.euclidean,
"Linf (chebyshev)": spatial.distance.chebyshev,
#'correlation': spatial.distance.correlation, # not sure this makes sense for individual vectors - looks like cosine
}
distance_metric_options = list(distance_metrics.keys())
distance = st.radio("Distance metric", distance_metric_options)
# select models
st.header("Select models")
models = [
"text-embedding-ada-002",
"text-similarity-ada-001",
"text-similarity-babbage-001",
"text-similarity-curie-001",
"text-similarity-davinci-001",
]
prechecked_models = [
"text-embedding-ada-002"
]
model_values = [st.checkbox(model, key=model, value=(model in prechecked_models)) for model in models]
# enter strings
st.header("Enter strings")
strings = []
if "num_boxes" not in st.session_state:
st.session_state.num_boxes = 5
if st.session_state.num_boxes > 2:
if st.button("Remove last text box"):
st.session_state.num_boxes -= 1
if st.button("Add new text box"):
st.session_state.num_boxes += 1
for i in range(st.session_state.num_boxes):
string = st.text_input(f"String {i+1}")
strings.append(string)
# rank strings
st.header("Rank strings by relatedness")
if st.button("Rank"):
# display a dataframe comparing rankings to string #1
st.subheader("Rankings")
ranked_strings = {}
for model, value in zip(models, model_values):
if value:
query_embedding = embedding_from_string(strings[0], model)
df = pd.DataFrame({"string": strings})
df[model] = df["string"].apply(lambda string: embedding_from_string(string, model))
df["distance"] = df[model].apply(
lambda embedding: distance_metrics[distance](query_embedding, embedding)
)
df = df.sort_values(by="distance")
ranked_strings[model] = df["string"].values
df = pd.DataFrame(ranked_strings)
st.dataframe(df)
# display charts of all the pairwise distances between strings
st.subheader("Distance matrices")
for model, value in zip(models, model_values):
if value:
plot_distance_matrix(strings, model, distance)

Binary file not shown (removed image, 409 KiB).

@ -1,6 +0,0 @@
altair
openai
pandas
scipy
streamlit
tenacity

@ -1,36 +0,0 @@
# Enterprise Knowledge Retrieval
This app is a deep dive on Enterprise Knowledge Retrieval, which aims to take some unstructured text documents and create a usable knowledge base application with it.
This repo contains a notebook and a basic Streamlit app:
- `enterprise_knowledge_retrieval.ipynb`: A notebook containing a step-by-step process of tokenising, chunking and embedding your data in a vector database, building a chat agent on top and running a basic evaluation of its performance.
- `chatbot.py`: A Streamlit app providing simple Q&A via a search bar to query your knowledge base.
To run the app, please follow the instructions below in the ```App``` section
## Notebook
The notebook is the best place to start, and takes you through an end-to-end workflow for setting up and evaluating a simple back-end knowledge retrieval service:
- **Setup:** Initiate variables and connect to a vector database.
- **Storage:** Configure the database, prepare our data and store embeddings and metadata for retrieval.
- **Search:** Extract relevant documents back out with a basic search function and use an LLM to summarise results into a concise reply.
- **Answer:** Add a more sophisticated agent which will process the user's query and maintain a memory for follow-up questions.
- **Evaluate:** Take question/answer pairs using our service, evaluate and plot them to scope out remedial action.
Once you've run the notebook through to the Search stage, you should have what you need to set up and run the app.
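As a rough sketch, the Search stage amounts to retrieval plus summarisation, along these lines (using the `get_redis_results` helper and the `CHAT_MODEL`/`INDEX_NAME` constants from this app's `database.py` and `config.py`; the prompt wording is illustrative):

```python
import openai

from config import CHAT_MODEL, INDEX_NAME
from database import get_redis_connection, get_redis_results

redis_client = get_redis_connection()

def search_and_summarise(query: str) -> str:
    # Pull back the top matching chunks from the vector store
    results = get_redis_results(redis_client, query, INDEX_NAME)
    content = "\n\n".join(results.head(3)["result"])

    # Ask the chat model to answer using only the retrieved content
    response = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[{
            "role": "user",
            "content": f"Answer the query using only this content:\n{content}\n\nQuery: {query}",
        }],
        max_tokens=500,
    )
    return response["choices"][0]["message"]["content"]
```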
## App
We've rolled in a basic Streamlit app that you can interact with to test your retrieval service using either standard semantic search or [HyDE](https://arxiv.org/abs/2212.10496) retrievals.
To use it:
- Ensure you followed the Setup and Storage steps from the notebook to populate a vector database with searchable content.
- Set up a virtual environment with pip by running ```virtualenv venv``` (ensure ```virtualenv``` is installed).
- Activate the environment by running ```source venv/bin/activate```.
- Install requirements by running ```pip install -r requirements.txt```.
- Run ```streamlit run chatbot.py``` to fire up the Streamlit app in your browser.
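For reference, the HyDE option mentioned above works by asking the model for a hypothetical answer first and then searching the vector store with that answer rather than the raw query; a minimal sketch (assuming the same `database.py` and `config.py` helpers used by `assistant.py`):

```python
import openai

from config import CHAT_MODEL, INDEX_NAME
from database import get_redis_connection, get_redis_results

redis_client = get_redis_connection()

def hyde_search(query: str):
    # Generate a hypothetical answer to the user's question
    hypothetical = openai.ChatCompletion.create(
        model=CHAT_MODEL,
        messages=[{"role": "user", "content": f"Answer confidently, guessing if needed: {query}"}],
    )["choices"][0]["message"]["content"]
    # Search the vector store with the hypothetical answer instead of the raw query
    return get_redis_results(redis_client, hypothetical, INDEX_NAME)
```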
## Limitations
- This app uses Redis as a vector database, but there are many other options highlighted in `../examples/vector_databases` depending on your needs.
- We introduce many areas you may optimize in the notebook, but we'll deep dive on these in subsequent cookbooks.

@ -1,183 +0,0 @@
from langchain.agents import (
Tool,
AgentExecutor,
LLMSingleActionAgent,
AgentOutputParser,
)
from langchain.prompts import BaseChatPromptTemplate
from langchain import SerpAPIWrapper, LLMChain
from langchain.chat_models import ChatOpenAI
from typing import List, Union
from langchain.schema import AgentAction, AgentFinish, HumanMessage
from langchain.memory import ConversationBufferWindowMemory
import openai
import re
import streamlit as st
from database import get_redis_results, get_redis_connection
from config import RETRIEVAL_PROMPT, CHAT_MODEL, INDEX_NAME, SYSTEM_PROMPT
redis_client = get_redis_connection()
def answer_user_question(query):
results = get_redis_results(redis_client, query, INDEX_NAME)
results.to_csv("results.csv")
search_content = ""
for x, y in results.head(3).iterrows():
search_content += y["title"] + "\n" + y["result"] + "\n\n"
retrieval_prepped = RETRIEVAL_PROMPT.format(
SEARCH_QUERY_HERE=query, SEARCH_CONTENT_HERE=search_content
)
retrieval = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[{"role": "user", "content": retrieval_prepped}],
max_tokens=500,
)
# Response provided by GPT-3.5
return retrieval["choices"][0]["message"]["content"]
def answer_question_hyde(query):
hyde_prompt = """You are OracleGPT, a helpful expert who answers user questions to the best of their ability.
Provide a confident answer to their question. If you don't know the answer, make the best guess you can based on the context of the question.
User question: {USER_QUESTION_HERE}
Answer:"""
hypothetical_answer = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[
{
"role": "user",
"content": hyde_prompt.format(USER_QUESTION_HERE=query),
}
],
)["choices"][0]["message"]["content"]
# st.write(hypothetical_answer)
results = get_redis_results(redis_client, hypothetical_answer, INDEX_NAME)
results.to_csv("results.csv")
search_content = ""
for x, y in results.head(3).iterrows():
search_content += y["title"] + "\n" + y["result"] + "\n\n"
retrieval_prepped = RETRIEVAL_PROMPT.replace("SEARCH_QUERY_HERE", query).replace(
"SEARCH_CONTENT_HERE", search_content
)
retrieval = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[{"role": "user", "content": retrieval_prepped}],
max_tokens=500,
)
return retrieval["choices"][0]["message"]["content"]
# Set up a prompt template
class CustomPromptTemplate(BaseChatPromptTemplate):
# The template to use
template: str
# The list of tools available
tools: List[Tool]
def format_messages(self, **kwargs) -> str:
# Get the intermediate steps (AgentAction, Observation tuples)
# Format them in a particular way
intermediate_steps = kwargs.pop("intermediate_steps")
thoughts = ""
for action, observation in intermediate_steps:
thoughts += action.log
thoughts += f"\nObservation: {observation}\nThought: "
# Set the agent_scratchpad variable to that value
kwargs["agent_scratchpad"] = thoughts
# Create a tools variable from the list of tools provided
kwargs["tools"] = "\n".join(
[f"{tool.name}: {tool.description}" for tool in self.tools]
)
# Create a list of tool names for the tools provided
kwargs["tool_names"] = ", ".join([tool.name for tool in self.tools])
formatted = self.template.format(**kwargs)
return [HumanMessage(content=formatted)]
class CustomOutputParser(AgentOutputParser):
def parse(self, llm_output: str) -> Union[AgentAction, AgentFinish]:
# Check if agent should finish
if "Final Answer:" in llm_output:
return AgentFinish(
# Return values is generally always a dictionary with a single `output` key
# It is not recommended to try anything else at the moment :)
return_values={"output": llm_output.split("Final Answer:")[-1].strip()},
log=llm_output,
)
# Parse out the action and action input
regex = r"Action\s*\d*\s*:(.*?)\nAction\s*\d*\s*Input\s*\d*\s*:[\s]*(.*)"
match = re.search(regex, llm_output, re.DOTALL)
if not match:
raise ValueError(f"Could not parse LLM output: `{llm_output}`")
action = match.group(1).strip()
action_input = match.group(2)
# Return the action and action input
return AgentAction(
tool=action, tool_input=action_input.strip(" ").strip('"'), log=llm_output
)
def initiate_agent(tools):
prompt = CustomPromptTemplate(
template=SYSTEM_PROMPT,
tools=tools,
# This omits the `agent_scratchpad`, `tools`, and `tool_names` variables because those are generated dynamically
# The history template includes "history" as an input variable so we can interpolate it into the prompt
input_variables=["input", "intermediate_steps", "history"],
)
# Initiate the memory with k=2 to keep the last two turns
# Provide the memory to the agent
memory = ConversationBufferWindowMemory(k=2)
output_parser = CustomOutputParser()
llm = ChatOpenAI(temperature=0)
# LLM chain consisting of the LLM and a prompt
llm_chain = LLMChain(llm=llm, prompt=prompt)
tool_names = [tool.name for tool in tools]
agent = LLMSingleActionAgent(
llm_chain=llm_chain,
output_parser=output_parser,
stop=["\nObservation:"],
allowed_tools=tool_names,
)
agent_executor = AgentExecutor.from_agent_and_tools(
agent=agent, tools=tools, verbose=True, memory=memory
)
return agent_executor
def ask_gpt(query):
response = openai.ChatCompletion.create(
model=CHAT_MODEL,
messages=[
{
"role": "user",
"content": "Please answer my question.\nQuestion: {}".format(query),
}
],
temperature=0,
)
return response["choices"][0]["message"]["content"]

@ -1,86 +0,0 @@
from langchain.agents import Tool
import pandas as pd
import streamlit as st
from streamlit_chat import message
from database import get_redis_connection
from assistant import (
answer_user_question,
initiate_agent,
answer_question_hyde,
ask_gpt,
)
# Initialise database
## Initialise Redis connection
redis_client = get_redis_connection()
### CHATBOT APP
# --- GENERAL SETTINGS ---
PAGE_TITLE: str = "Knowledge Retrieval Bot"
PAGE_ICON: str = "🤖"
st.set_page_config(page_title=PAGE_TITLE, page_icon=PAGE_ICON)
st.title("Wiki Chatbot")
st.subheader("Learn things - random things!")
# Using object notation
add_selectbox = st.sidebar.selectbox(
"What kind of search?", ("Standard vector search", "HyDE")
)
# Define which tools the agent can use to answer user queries
tools = [
Tool(
name="Search",
func=answer_user_question
if add_selectbox == "Standard vector search"
else answer_question_hyde,
description="Useful for when you need to answer general knowledge questions. Input should be a fully formed question.",
),
Tool(
name="Ask",
func=ask_gpt,
description="Useful if the question is not general knowledge. Input should be a fully formed question.",
),
]
if "generated" not in st.session_state:
st.session_state["generated"] = []
if "past" not in st.session_state:
st.session_state["past"] = []
def query(question):
response = st.session_state["chat"].ask_assistant(question)
return response
prompt = st.text_input("What do you want to know: ", "", key="input")
if st.button("Submit", key="generationSubmit"):
with st.spinner("Thinking..."):
# Initialization
if "agent" not in st.session_state:
st.session_state["agent"] = initiate_agent(tools)
response = st.session_state["agent"].run(prompt)
st.session_state.past.append(prompt)
st.session_state.generated.append(response)
if len(st.session_state["generated"]) > 0:
for i in range(len(st.session_state["generated"]) - 1, -1, -1):
message(st.session_state["generated"][i], key=str(i))
message(st.session_state["past"][i], is_user=True, key=str(i) + "_user")
with st.expander("See search results"):
results = list(pd.read_csv("results.csv")["result"])
st.write(results)

@ -1,46 +0,0 @@
REDIS_HOST = "localhost"
REDIS_PORT = "6379"
REDIS_DB = "0"
INDEX_NAME = "wiki-index"
VECTOR_FIELD_NAME = "content_vector"
CHAT_MODEL = "gpt-3.5-turbo"
EMBEDDINGS_MODEL = "text-embedding-ada-002"
# Set up the base template
SYSTEM_PROMPT = """You are WikiGPT, a helpful bot who has access to a database of Wikipedia data to answer questions.
Accept the first answer that you are provided for the user.
You have access to the following tools:
{tools}
Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin! Remember to give detailed, informative answers
Previous conversation history:
{history}
New question: {input}
{agent_scratchpad}"""
# Build a prompt to provide the original query, the result and ask to summarise for the user
RETRIEVAL_PROMPT = """Use the content to answer the search query the customer has sent. Provide the source for your answer.
If you can't answer the user's question, say "Sorry, I am unable to answer the question with the content". Do not guess.
Search query:
{SEARCH_QUERY_HERE}
Content:
{SEARCH_CONTENT_HERE}
Answer:
"""

@ -1,72 +0,0 @@
import ast
from math import isnan
import numpy as np
import pandas as pd
import openai
from redis import Redis as r
from redis.commands.search.query import Query
from config import (
REDIS_DB,
REDIS_HOST,
REDIS_PORT,
VECTOR_FIELD_NAME,
EMBEDDINGS_MODEL,
INDEX_NAME,
)
def get_redis_connection():
redis_client = r(
host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB, decode_responses=False
)
return redis_client
# Make query to Redis
def query_redis(redis_conn, query, index_name, top_k=5):
## Creates embedding vector from user query
embedded_query = np.array(
openai.Embedding.create(input=query, model=EMBEDDINGS_MODEL,)["data"][
0
]["embedding"],
dtype=np.float32,
).tobytes()
# prepare the query
q = (
Query(f"*=>[KNN {top_k} @{VECTOR_FIELD_NAME} $vec_param AS vector_score]")
.sort_by("vector_score")
.paging(0, top_k)
.return_fields("vector_score", "url", "title", "content", "text_chunk_index")
.dialect(2)
)
params_dict = {"vec_param": embedded_query}
# Execute the query
results = redis_conn.ft(index_name).search(q, query_params=params_dict)
return results
# Get mapped documents from Redis results
def get_redis_results(redis_conn, query, index_name):
# Get most relevant documents from Redis
query_result = query_redis(redis_conn, query, index_name)
# Extract info into a list
query_result_list = []
for i, result in enumerate(query_result.docs):
result_order = i
url = result.url
title = result.title
text = result.content
score = result.vector_score
query_result_list.append((result_order, url, title, text, score))
# Display result as a DataFrame for ease of use
result_df = pd.DataFrame(query_result_list)
result_df.columns = ["id", "url", "title", "result", "certainty"]
return result_df

Binary file not shown (removed image, 72 KiB).

@ -1,7 +0,0 @@
langchain==0.0.312
numpy==1.24.2
openai==0.27.4
pandas==2.0.0
redis==4.5.4
streamlit==1.22.0
streamlit_chat==0.0.2.2

@ -1,18 +0,0 @@
# File Q&A
File Q&A is a [Next.js](https://nextjs.org/) app that lets you find answers in your files using OpenAI APIs. You can upload files and ask questions related to their content, and the app will use embeddings and GPT to generate answers from the most relevant files.
This repo contains two versions of the app:
- `/nextjs`: A standalone Next.js app that stores embeddings locally in the browser. You will need an OpenAI API key to use this app. Read more in its [README](./nextjs/README.md).
- `/nextjs-with-flask-server`: A Next.js app that uses a Flask server as a proxy to access the OpenAI APIs, and Pinecone as a vector database to store embeddings. You will need an OpenAI API key and a Pinecone API key to use this app. Read more in its [README](./nextjs-with-flask-server/README.md).
To run either version of the app, please follow the instructions in the respective README.md files in the subdirectories.
## How it works
When a file is uploaded, text is extracted from the file. This text is then split into shorter text chunks, and an embedding is created for each text chunk. When the user asks a question, an embedding is created for the question, and a similarity search is performed to find the file chunk embeddings that are most similar to the question (i.e. have the highest cosine similarity with the question embedding). An API call is then made to the completions endpoint, with the question and the most relevant file chunks included in the prompt. The generative model then answers the question using those file chunks, if the answer can be found in them.
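The flow described above is language-agnostic; here is a minimal Python sketch of the same idea (the model names and prompt are illustrative, and the real app implements this in the Next.js and Flask code rather than Python):

```python
from typing import List

import numpy as np
import openai

EMBEDDINGS_MODEL = "text-embedding-ada-002"  # illustrative choice of embeddings model
COMPLETIONS_MODEL = "text-davinci-003"       # illustrative choice of completions model

def cosine_similarity(a, b) -> float:
    a, b = np.asarray(a), np.asarray(b)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

def answer_from_chunks(question: str, chunks: List[str], top_k: int = 3) -> str:
    # Embed every file chunk and the question
    chunk_embeddings = [
        d["embedding"]
        for d in openai.Embedding.create(input=chunks, model=EMBEDDINGS_MODEL)["data"]
    ]
    question_embedding = openai.Embedding.create(
        input=question, model=EMBEDDINGS_MODEL
    )["data"][0]["embedding"]

    # Rank chunks by cosine similarity to the question and keep the best ones
    ranked = sorted(
        zip(chunks, chunk_embeddings),
        key=lambda pair: cosine_similarity(question_embedding, pair[1]),
        reverse=True,
    )
    context = "\n\n".join(chunk for chunk, _ in ranked[:top_k])

    # Ask the completions endpoint to answer only from the selected chunks
    prompt = (
        "Answer the question using only the extracts below.\n\n"
        f"Extracts:\n{context}\n\nQuestion: {question}\nAnswer:"
    )
    completion = openai.Completion.create(
        model=COMPLETIONS_MODEL, prompt=prompt, max_tokens=300
    )
    return completion["choices"][0]["text"].strip()
```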
## Limitations
The app may sometimes generate answers that are not in the files, or hallucinate about the existence of files that are not uploaded.

@ -1,47 +0,0 @@
# File Q&A with Next.js and Flask
File Q&A is a web app that lets you find answers in your files. You can upload files and ask questions related to their content, and the app will use embeddings and GPT to generate answers from the most relevant files.
## Requirements
To run the app, you need:
- An OpenAI API key. You can create a new API key [here](https://beta.openai.com/account/api-keys).
- A Pinecone API key and index name. You can create a new account and index [here](https://www.pinecone.io/).
- Python 3.7 or higher and pipenv for the Flask server.
- Node.js and npm for the Next.js client.
## Set-Up and Development
### Server
Fill out the config.yaml file with your Pinecone API key, index name and environment.
Run the Flask server:
```
cd server
bash script/start "<your OPENAI_API_KEY>"
```
### Client
Navigate to the client directory and install Node dependencies:
```
cd client
npm install
```
Run the Next.js client:
```
cd client
npm run dev
```
Open [http://localhost:3000](http://localhost:3000) with your browser to see the app.
## Limitations
The app may sometimes generate answers that are not in the files, or hallucinate about the existence of files that are not uploaded.

@ -1,3 +0,0 @@
{
"extends": "next/core-web-vitals"
}

@ -1,36 +0,0 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.js
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# local env files
.env*.local
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts

@ -1,6 +0,0 @@
/** @type {import('next').NextConfig} */
const nextConfig = {
reactStrictMode: true,
}
module.exports = nextConfig

File diff suppressed because it is too large.

@ -1,44 +0,0 @@
{
"name": "file-q-and-a",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"@headlessui/react": "^1.7.7",
"@heroicons/react": "^2.0.13",
"@next/font": "13.1.2",
"@tailwindcss/line-clamp": "^0.4.2",
"@tailwindcss/typography": "^0.5.9",
"@types/formidable": "^2.0.5",
"@types/lodash": "^4.14.191",
"@types/node": "18.11.18",
"@types/pdf-parse": "^1.1.1",
"@types/react": "18.0.27",
"@types/react-dom": "18.0.10",
"axios": "^1.2.3",
"clsx": "^1.2.1",
"eslint": "8.32.0",
"eslint-config-next": "13.1.2",
"formidable": "^2.1.1",
"lodash": "^4.17.21",
"mammoth": "^1.5.1",
"next": "13.1.2",
"node-html-markdown": "^1.3.0",
"openai": "^3.1.0",
"pdf-parse": "^1.1.1",
"react": "18.2.0",
"react-dom": "18.2.0",
"react-markdown": "^8.0.5",
"typescript": "4.9.4"
},
"devDependencies": {
"autoprefixer": "^10.4.13",
"postcss": "^8.4.31",
"tailwindcss": "^3.2.4"
}
}

@ -1,6 +0,0 @@
module.exports = {
plugins: {
tailwindcss: {},
autoprefixer: {},
},
};

Binary file not shown (removed image, 262 KiB).

@ -1 +0,0 @@
<svg id="openai-horizontal" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 120 29.53"><path d="M40.7,6.98s-.05,0-.07,0c-.02,0-.05,0-.07,0-4.67,0-7.58,2.91-7.58,7.6v2.53c0,4.69,2.9,7.6,7.58,7.6,.02,0,.05,0,.07,0,.02,0,.05,0,.07,0,4.67,0,7.58-2.91,7.58-7.6v-2.53c0-4.69-2.91-7.6-7.58-7.6Zm4.31,10.31c0,3.08-1.6,4.86-4.38,4.89-2.78-.03-4.38-1.81-4.38-4.89v-2.88c0-3.08,1.6-4.86,4.38-4.89,2.78,.03,4.38,1.81,4.38,4.89v2.88Zm40.57-5.79s-.06,0-.09,0c-.02,0-.03,0-.05,0-1.77,0-3.03,.6-3.65,1.75l-.19,.35v-1.8h-3.02v12.56h3.17v-7.48c0-1.76,.95-2.77,2.59-2.8,1.57,.03,2.47,1.02,2.47,2.73v7.55h3.17v-8.09c0-2.99-1.64-4.77-4.39-4.77Zm34.42-1.77v-2.4h-10.46v2.4h3.67v12.22h-3.67v2.4h10.46v-2.4h-3.67V9.73h3.67Zm-18.75-2.4h0s-3.28,0-3.28,0l-6.1,17.04h3.43l1.17-3.65h6.66v.04s1.17,3.62,1.17,3.62h3.43l-6.11-17.04h-.36Zm-4.03,10.98l2.57-8.05,2.55,8.05h-5.12Zm-39.45-6.81s-.05,0-.07,0c-.03,0-.05,0-.07,0-1.59,0-2.96,.66-3.68,1.76l-.18,.28v-1.74h-3.02V28.69h3.17v-5.9l.18,.27c.68,1.01,2.01,1.61,3.56,1.61,.03,0,.05,0,.08,0,.02,0,.04,0,.07,0,2.61,0,5.24-1.7,5.24-5.51v-2.14c0-2.74-1.62-5.51-5.26-5.51Zm2.1,7.5c0,2-1.15,3.24-3.01,3.28-1.73-.03-2.94-1.35-2.94-3.23v-1.89c0-1.9,1.22-3.24,2.97-3.28,1.84,.03,2.98,1.28,2.98,3.28v1.84Zm11.05-7.5h0c-.06,0-.12,.01-.18,.01-.06,0-.12-.01-.18-.01h0c-3.57,0-5.78,2.23-5.78,5.81v1.76c0,3.45,2.24,5.59,5.83,5.59,.08,0,.15,0,.22-.01,.05,0,.09,.01,.14,.01,2.41,0,4.09-.88,5.16-2.7l-2.13-1.23c-.71,1.05-1.66,1.84-3.02,1.84-1.82,0-2.91-1.12-2.91-3.01v-.5h8.44v-2.08c0-3.34-2.19-5.49-5.59-5.49Zm-2.86,5.54v-.3c0-2,.95-3.12,2.68-3.2,1.66,.08,2.66,1.18,2.66,2.99v.5s-5.34,0-5.34,0Z"></path><path d="M27.21,12.08c.67-2.01,.44-4.21-.63-6.04-1.61-2.8-4.85-4.24-8.01-3.57C17.16,.89,15.14-.01,13.02,0c-3.23,0-6.1,2.08-7.1,5.15-2.08,.43-3.87,1.73-4.92,3.57-1.62,2.8-1.25,6.32,.92,8.72-.67,2.01-.44,4.21,.63,6.03,1.61,2.81,4.85,4.25,8.02,3.58,1.4,1.58,3.42,2.49,5.54,2.48,3.23,0,6.1-2.08,7.1-5.15,2.08-.43,3.87-1.73,4.91-3.57,1.63-2.8,1.26-6.32-.91-8.72Zm-2.3-5.07c.64,1.12,.88,2.43,.66,3.7-.04-.03-.12-.07-.17-.1l-5.88-3.4c-.3-.17-.67-.17-.97,0l-6.89,3.98v-2.92l5.69-3.29c2.65-1.53,6.03-.62,7.56,2.03Zm-13.25,6.07l2.9-1.68,2.9,1.68v3.35l-2.9,1.68-2.9-1.68v-3.35ZM13.01,1.93c1.3,0,2.55,.45,3.55,1.28-.04,.02-.12,.07-.18,.1l-5.88,3.39c-.3,.17-.48,.49-.48,.84v7.96l-2.53-1.46V7.46c0-3.06,2.47-5.53,5.53-5.54ZM2.68,9.69h0c.65-1.12,1.66-1.98,2.88-2.43v6.99c0,.35,.18,.66,.48,.84l6.88,3.97-2.54,1.47-5.68-3.28c-2.64-1.53-3.55-4.91-2.02-7.56Zm1.55,12.83h0c-.65-1.11-.88-2.43-.66-3.7,.04,.03,.12,.07,.17,.1l5.88,3.4c.3,.17,.67,.17,.97,0l6.88-3.98v2.92l-5.69,3.28c-2.65,1.52-6.03,.62-7.56-2.02Zm11.89,5.08c-1.29,0-2.55-.45-3.54-1.28,.04-.02,.13-.07,.18-.1l5.88-3.39c.3-.17,.49-.49,.48-.84v-7.95l2.53,1.46v6.57c0,3.06-2.48,5.54-5.53,5.54Zm10.34-7.76c-.65,1.12-1.67,1.98-2.88,2.42v-6.99c0-.35-.18-.67-.48-.84h0l-6.89-3.98,2.53-1.46,5.69,3.28c2.65,1.53,3.55,4.91,2.02,7.56Z"></path></svg>


@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" id="openai-symbol" viewBox="0 0 32 32"><path d="M29.71,13.09A8.09,8.09,0,0,0,20.34,2.68a8.08,8.08,0,0,0-13.7,2.9A8.08,8.08,0,0,0,2.3,18.9,8,8,0,0,0,3,25.45a8.08,8.08,0,0,0,8.69,3.87,8,8,0,0,0,6,2.68,8.09,8.09,0,0,0,7.7-5.61,8,8,0,0,0,5.33-3.86A8.09,8.09,0,0,0,29.71,13.09Zm-12,16.82a6,6,0,0,1-3.84-1.39l.19-.11,6.37-3.68a1,1,0,0,0,.53-.91v-9l2.69,1.56a.08.08,0,0,1,.05.07v7.44A6,6,0,0,1,17.68,29.91ZM4.8,24.41a6,6,0,0,1-.71-4l.19.11,6.37,3.68a1,1,0,0,0,1,0l7.79-4.49V22.8a.09.09,0,0,1,0,.08L13,26.6A6,6,0,0,1,4.8,24.41ZM3.12,10.53A6,6,0,0,1,6.28,7.9v7.57a1,1,0,0,0,.51.9l7.75,4.47L11.85,22.4a.14.14,0,0,1-.09,0L5.32,18.68a6,6,0,0,1-2.2-8.18Zm22.13,5.14-7.78-4.52L20.16,9.6a.08.08,0,0,1,.09,0l6.44,3.72a6,6,0,0,1-.9,10.81V16.56A1.06,1.06,0,0,0,25.25,15.67Zm2.68-4-.19-.12-6.36-3.7a1,1,0,0,0-1.05,0l-7.78,4.49V9.2a.09.09,0,0,1,0-.09L19,5.4a6,6,0,0,1,8.91,6.21ZM11.08,17.15,8.38,15.6a.14.14,0,0,1-.05-.08V8.1a6,6,0,0,1,9.84-4.61L18,3.6,11.61,7.28a1,1,0,0,0-.53.91ZM12.54,14,16,12l3.47,2v4L16,20l-3.47-2Z"/></svg>


@ -1,77 +0,0 @@
import { useState, useCallback, memo } from "react";
import { Transition } from "@headlessui/react";
import {
MagnifyingGlassMinusIcon,
MagnifyingGlassPlusIcon,
ArrowTopRightOnSquareIcon,
} from "@heroicons/react/24/outline";
import { FileLite } from "../types/file";
type FileProps = {
file: FileLite;
showScore?: boolean;
};
function File(props: FileProps) {
const [expanded, setExpanded] = useState(false);
const handleExpand = useCallback(() => {
setExpanded((prev) => !prev);
}, []);
return (
<div
className="border-gray-100 border rounded-md shadow p-2 cursor-pointer"
onClick={handleExpand}
>
<div className="flex flex-row justify-between">
<div className="flex hover:text-gray-600">{props.file.name}</div>
<div className="flex flex-row space-x-2">
{props.showScore && props.file.score && (
<div className="flex text-blue-600 mr-4">
{props.file.score.toFixed(2)}
</div>
)}
<div className="ml-auto w-max flex items-center justify-center">
{expanded ? (
<MagnifyingGlassMinusIcon className="text-gray-500 h-5" />
) : (
<MagnifyingGlassPlusIcon className="text-gray-500 h-5" />
)}
</div>
<a
href={props.file.url}
target="_blank"
rel="noopener noreferrer"
onClick={(e) => e.stopPropagation()} // prevent the click event from bubbling up to the list item
>
<ArrowTopRightOnSquareIcon className="text-gray-500 h-5" />
</a>
</div>
</div>
<Transition
show={expanded}
enter="transition duration-75 ease-out"
enterFrom="transform translate-y-4 opacity-0"
enterTo="transform translate-y-0 opacity-100"
leave="transition duration-100 ease-out"
leaveFrom="transform translate-y-0 opacity-100"
leaveTo="transform translate-y-4 opacity-0"
>
<div className="items-center mt-2 justify-center">
<iframe
src={props.file.url}
className="h-full w-full"
title={props.file.name}
></iframe>
</div>
</Transition>
</div>
);
}
export default memo(File);

@ -1,147 +0,0 @@
import React, { memo, useCallback, useRef, useState } from "react";
import { Transition } from "@headlessui/react";
import axios from "axios";
import ReactMarkdown from "react-markdown";
import FileViewerList from "./FileViewerList";
import LoadingText from "./LoadingText";
import { isFileNameInString } from "../services/utils";
import { FileChunk, FileLite } from "../types/file";
import { SERVER_ADDRESS } from "../types/constants";
type FileQandAAreaProps = {
files: FileLite[];
};
function FileQandAArea(props: FileQandAAreaProps) {
const searchBarRef = useRef(null);
const [answerError, setAnswerError] = useState("");
const [searchResultsLoading, setSearchResultsLoading] =
useState<boolean>(false);
const [answer, setAnswer] = useState("");
const handleSearch = useCallback(async () => {
if (searchResultsLoading) {
return;
}
const question = (searchBarRef?.current as any)?.value ?? "";
setAnswer("");
if (!question) {
setAnswerError("Please ask a question.");
return;
}
if (props.files.length === 0) {
setAnswerError("Please upload files before asking a question.");
return;
}
setSearchResultsLoading(true);
setAnswerError("");
let results: FileChunk[] = [];
try {
const answerResponse = await axios.post(
`${SERVER_ADDRESS}/answer_question`,
{
question,
}
);
if (answerResponse.status === 200) {
setAnswer(answerResponse.data.answer);
} else {
setAnswerError("Sorry, something went wrong!");
}
} catch (err: any) {
setAnswerError("Sorry, something went wrong!");
}
setSearchResultsLoading(false);
}, [props.files, searchResultsLoading]);
const handleEnterInSearchBar = useCallback(
async (event: React.SyntheticEvent) => {
if ((event as any).key === "Enter") {
await handleSearch();
}
},
[handleSearch]
);
return (
<div className="space-y-4 text-gray-800">
<div className="mt-2">
Ask a question based on the content of your files:
</div>
<div className="space-y-2">
<input
className="border rounded border-gray-200 w-full py-1 px-2"
placeholder="e.g. What were the key takeaways from the Q1 planning meeting?"
name="search"
ref={searchBarRef}
onKeyDown={handleEnterInSearchBar}
/>
<div
className="rounded-md bg-gray-50 py-1 px-4 w-max text-gray-500 hover:bg-gray-100 border border-gray-100 shadow cursor-pointer"
onClick={handleSearch}
>
{searchResultsLoading ? (
<LoadingText text="Answering question..." />
) : (
"Ask question"
)}
</div>
</div>
<div className="">
{answerError && <div className="text-red-500">{answerError}</div>}
<Transition
show={answer !== ""}
enter="transition duration-600 ease-out"
enterFrom="transform opacity-0"
enterTo="transform opacity-100"
leave="transition duration-125 ease-out"
leaveFrom="transform opacity-100"
leaveTo="transform opacity-0"
className="mb-8"
>
{/* answer from files */}
{answer && (
<div className="">
<ReactMarkdown className="prose" linkTarget="_blank">
{answer}
</ReactMarkdown>
</div>
)}
<Transition
show={
props.files.filter((file) =>
isFileNameInString(file.name, answer)
).length > 0
}
enter="transition duration-600 ease-out"
enterFrom="transform opacity-0"
enterTo="transform opacity-100"
leave="transition duration-125 ease-out"
leaveFrom="transform opacity-100"
leaveTo="transform opacity-0"
className="mb-8"
>
<FileViewerList
files={props.files.filter((file) =>
isFileNameInString(file.name, answer)
)}
title="Sources"
listExpanded={true}
/>
</Transition>
</Transition>
</div>
</div>
);
}
export default memo(FileQandAArea);

@ -1,195 +0,0 @@
import React, {
Dispatch,
SetStateAction,
useCallback,
useState,
memo,
useRef,
} from "react";
import axios from "axios";
import { ArrowUpTrayIcon } from "@heroicons/react/24/outline";
import { compact } from "lodash";
import LoadingText from "./LoadingText";
import { FileLite } from "../types/file";
import FileViewerList from "./FileViewerList";
import { SERVER_ADDRESS } from "../types/constants";
type FileUploadAreaProps = {
handleSetFiles: Dispatch<SetStateAction<FileLite[]>>;
maxNumFiles: number;
maxFileSizeMB: number;
};
function FileUploadArea(props: FileUploadAreaProps) {
const handleSetFiles = props.handleSetFiles;
const [files, setFiles] = useState<FileLite[]>([]);
const [loading, setLoading] = useState(false);
const [error, setError] = useState("");
const [dragOver, setDragOver] = useState(false);
const dropzoneRef = useRef<HTMLLabelElement>(null);
const handleFileChange = useCallback(
async (selectedFiles: FileList | null) => {
if (selectedFiles && selectedFiles.length > 0) {
setError("");
if (files.length + selectedFiles.length > props.maxNumFiles) {
setError(`You can only upload up to ${props.maxNumFiles} files.`);
if (dropzoneRef.current) {
(dropzoneRef.current as any).value = "";
}
return;
}
setLoading(true);
const uploadedFiles = await Promise.all(
Array.from(selectedFiles).map(async (file) => {
// Check the file type
if (
file.type.match(
/(text\/plain|application\/(pdf|msword|vnd\.openxmlformats-officedocument\.wordprocessingml\.document))/
) && // AND the file isn't too big
file.size < props.maxFileSizeMB * 1024 * 1024
) {
// Check if the file name already exists in the files state
if (files.find((f) => f.name === file.name)) {
return null; // skip this file
}
const formData = new FormData();
formData.append("file", file);
try {
const processFileResponse = await axios.post(
`${SERVER_ADDRESS}/process_file`,
formData,
{
headers: {
"Content-Type": "multipart/form-data",
},
}
);
if (
processFileResponse.status === 200 &&
processFileResponse.data.success
) {
const fileObject: FileLite = {
name: file.name,
url: URL.createObjectURL(file),
expanded: false,
};
console.log(fileObject);
return fileObject;
} else {
console.log("Error processing file");
return null;
}
} catch (err: any) {
console.log(`error processing file: ${err}`);
return null;
}
} else {
alert(
`Invalid file type or size. Only TXT, PDF or DOCX are allowed, up to ${props.maxFileSizeMB}MB.`
);
return null; // Skip this file
}
})
);
// Filter out any null values from the uploadedFiles array
const validFiles = compact(uploadedFiles);
// Set the files state with the valid files and the existing files
setFiles((prevFiles) => [...prevFiles, ...validFiles]);
handleSetFiles((prevFiles) => [...prevFiles, ...validFiles]);
setLoading(false);
}
},
[files, handleSetFiles, props.maxFileSizeMB, props.maxNumFiles]
);
const handleDragEnter = useCallback((event: React.DragEvent) => {
event.preventDefault();
setDragOver(true);
}, []);
const handleDragOver = useCallback((event: React.DragEvent) => {
event.preventDefault();
}, []);
const handleDragLeave = useCallback((event: React.DragEvent) => {
event.preventDefault();
setDragOver(false);
}, []);
const handleDrop = useCallback(
(event: React.DragEvent) => {
event.preventDefault();
setDragOver(false);
const droppedFiles = event.dataTransfer.files;
handleFileChange(droppedFiles);
},
[handleFileChange]
);
return (
<div className="flex items-center justify-center w-full flex-col">
<label
htmlFor="dropzone-file"
className={`flex flex-col shadow items-center justify-center w-full h-36 border-2 border-gray-300 border-dashed rounded-lg cursor-pointer bg-gray-50 hover:bg-gray-100 relative ${
dragOver ? "border-blue-500 bg-blue-50" : ""
}`}
ref={dropzoneRef}
onDragEnter={handleDragEnter}
onDragOver={handleDragOver}
onDragLeave={handleDragLeave}
onDrop={handleDrop}
>
<div className="flex flex-col items-center justify-center pt-5 pb-6">
{loading ? (
<LoadingText text="Uploading..." />
) : (
<div className="text-gray-500 flex flex-col items-center text-center">
<ArrowUpTrayIcon className="w-7 h-7 mb-4" />
<p className="mb-2 text-sm">
<span className="font-semibold">Click to upload</span> or drag
and drop
</p>
<p className="text-xs">
PDF, DOCX or TXT (max {props.maxFileSizeMB}MB per file)
</p>
<p className="text-xs mt-1">
You can upload up to {props.maxNumFiles - files.length} more{" "}
{props.maxNumFiles - files.length === 1 ? "file" : "files"}
</p>
<input
id="dropzone-file"
type="file"
className="hidden"
multiple
onChange={(event) => handleFileChange(event.target.files)}
/>
</div>
)}
</div>
</label>
{error && (
<div className="flex items-center justify-center w-full mt-4">
<p className="text-sm text-red-500">{error}</p>
</div>
)}
<FileViewerList files={files} title="Uploaded Files" />
</div>
);
}
export default memo(FileUploadArea);

@ -1,73 +0,0 @@
import React, { memo, useCallback, useState } from "react";
import { ChevronUpIcon } from "@heroicons/react/24/outline";
import clsx from "clsx";
import { Transition } from "@headlessui/react";
import File from "./File";
import { FileLite } from "../types/file";
type FileViewerListProps = {
files: FileLite[];
title: string;
listExpanded?: boolean;
showScores?: boolean;
};
function FileViewerList(props: FileViewerListProps) {
const [listExpanded, setListExpanded] = useState(props.listExpanded ?? false);
const handleListExpand = useCallback(() => {
setListExpanded((prev) => !prev);
}, []);
return (
<div className="flex items-left justify-center w-full">
{props.files.length > 0 && (
<div className="flex flex-col items-left justify-center w-full mt-4">
<div className="flex flex-row">
<div
className="rounded-md flex shadow p-2 mb-2 w-full bg-gray-50 items-center cursor-pointer "
onClick={handleListExpand}
>
{props.title}
<div className="bg-gray-300 ml-2 px-2 rounded-full w-max text-center text-sm ">
{props.files.length}
</div>
</div>
<div className="ml-auto w-max flex items-center justify-center">
<ChevronUpIcon
className={clsx(
"w-6 h-6 ml-2 stroke-slate-400 transition-transform cursor-pointer",
!listExpanded && "-rotate-180"
)}
onClick={handleListExpand}
/>
</div>
</div>
<Transition
show={listExpanded}
enter="transition duration-125 ease-out"
enterFrom="transform translate-y-4 opacity-0"
enterTo="transform translate-y-0 opacity-100"
leave="transition duration-125 ease-out"
leaveFrom="transform translate-y-0 opacity-100"
leaveTo="transform translate-y-4 opacity-0"
>
<div className="text-sm text-gray-500 space-y-2">
{props.files.map((file) => (
<File
key={file.name}
file={file}
showScore={props.showScores}
/>
))}
</div>
</Transition>
</div>
)}
</div>
);
}
export default memo(FileViewerList);

@ -1,33 +0,0 @@
import clsx from "clsx";
type Props = {
className?: string;
size?: number;
};
export default function LoadingSpinner(props: Props) {
const size = props.size || 5;
return (
<div className={clsx("flex flex-row", props.className)}>
<svg
aria-hidden="true"
className={clsx(
"mr-2 text-gray-200 animate-spin dark:text-gray-600 fill-black stroke-1",
`w-${size} h-${size}`
)}
viewBox="0 0 100 101"
fill="none"
xmlns="http://www.w3.org/2000/svg"
>
<path
d="M100 50.5908C100 78.2051 77.6142 100.591 50 100.591C22.3858 100.591 0 78.2051 0 50.5908C0 22.9766 22.3858 0.59082 50 0.59082C77.6142 0.59082 100 22.9766 100 50.5908ZM9.08144 50.5908C9.08144 73.1895 27.4013 91.5094 50 91.5094C72.5987 91.5094 90.9186 73.1895 90.9186 50.5908C90.9186 27.9921 72.5987 9.67226 50 9.67226C27.4013 9.67226 9.08144 27.9921 9.08144 50.5908Z"
fill="currentColor"
/>
<path
d="M93.9676 39.0409C96.393 38.4038 97.8624 35.9116 97.0079 33.5539C95.2932 28.8227 92.871 24.3692 89.8167 20.348C85.8452 15.1192 80.8826 10.7238 75.2124 7.41289C69.5422 4.10194 63.2754 1.94025 56.7698 1.05124C51.7666 0.367541 46.6976 0.446843 41.7345 1.27873C39.2613 1.69328 37.813 4.19778 38.4501 6.62326C39.0873 9.04874 41.5694 10.4717 44.0505 10.1071C47.8511 9.54855 51.7191 9.52689 55.5402 10.0491C60.8642 10.7766 65.9928 12.5457 70.6331 15.2552C75.2735 17.9648 79.3347 21.5619 82.5849 25.841C84.9175 28.9121 86.7997 32.2913 88.1811 35.8758C89.083 38.2158 91.5421 39.6781 93.9676 39.0409Z"
fill="currentFill"
/>
</svg>
</div>
);
}

@ -1,18 +0,0 @@
import React, { memo } from "react";
import LoadingSpinner from "./LoadingSpinner";
type LoadingTextProps = {
text: string;
};
function LoadingText(props: LoadingTextProps) {
return (
<div className="text-gray-500 text-md flex flex-row justify-center items-center">
<LoadingSpinner />
{props.text && <div className="flex">{props.text}</div>}
</div>
);
}
export default memo(LoadingText);

@ -1,6 +0,0 @@
import "@/styles/globals.css";
import type { AppProps } from "next/app";
export default function App({ Component, pageProps }: AppProps) {
return <Component {...pageProps} />;
}

@ -1,13 +0,0 @@
import { Html, Head, Main, NextScript } from "next/document";
export default function Document() {
return (
<Html lang="en">
<Head />
<body>
<Main />
<NextScript />
</body>
</Html>
);
}

@ -1,35 +0,0 @@
import Head from "next/head";
import { useState } from "react";
import FileQandAArea from "../components/FileQandAArea";
import { FileLite } from "../types/file";
import FileUploadArea from "../components/FileUploadArea";
export default function FileQandA() {
const [files, setFiles] = useState<FileLite[]>([]);
return (
<div className="flex items-left text-left h-screen flex-col">
<Head>
<title>File Q&A</title>
</Head>
<div className="max-w-3xl mx-auto m-8 space-y-8 text-gray-800">
<h1 className="text-4xl">File Q&A</h1>
<div className="">
To search for answers from the content in your files, upload them here
and we will use OpenAI embeddings and GPT to find answers from the
relevant documents.
</div>
<FileUploadArea
handleSetFiles={setFiles}
maxNumFiles={75}
maxFileSizeMB={30}
/>
<FileQandAArea files={files} />
</div>
</div>
);
}

@ -1,19 +0,0 @@
// A function that takes a file name and a string and returns true if the file name is contained in the string
// after removing punctuation and whitespace from both
export const isFileNameInString = (fileName: string, str: string) => {
// Check if the input string is null or undefined
if (!str) {
return false;
}
// Convert both to lowercase and remove punctuation and whitespace
const normalizedFileName = fileName
.toLowerCase()
.replace(/[.,/#!$%^&*;:{}=\-_~()\s]/g, "");
const normalizedStr = str
.toLowerCase()
.replace(/[.,/#!$%^&*;:{}=\-_~()\s]/g, "");
// Return true if the normalized file name is included in the normalized string
return normalizedStr.includes(normalizedFileName);
};

@ -1,5 +0,0 @@
@import "./preflight.css";
@tailwind base;
@tailwind components;
@tailwind utilities;

@ -1,368 +0,0 @@
/* Using a custom preflight to fix conflicts with Ant Design */
/* Original: https://unpkg.com/tailwindcss@3.2.4/src/css/preflight.css */
/*
1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4)
2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116)
*/
*,
::before,
::after {
box-sizing: border-box; /* 1 */
border-width: 0; /* 2 */
border-style: solid; /* 2 */
border-color: theme("borderColor.DEFAULT"); /* 2 */
}
::before,
::after {
--tw-content: "";
}
/*
1. Use a consistent sensible line-height in all browsers.
2. Prevent adjustments of font size after orientation changes in iOS.
3. Use a more readable tab size.
4. Use the user's configured `sans` font-family by default.
5. Use the user's configured `sans` font-feature-settings by default.
*/
html {
line-height: 1.5; /* 1 */
-webkit-text-size-adjust: 100%; /* 2 */
-moz-tab-size: 4; /* 3 */
tab-size: 4; /* 3 */
font-family: theme("fontFamily.sans"); /* 4 */
}
/*
1. Remove the margin in all browsers.
2. Inherit line-height from `html` so users can set them as a class directly on the `html` element.
*/
body {
margin: 0; /* 1 */
line-height: inherit; /* 2 */
}
/*
1. Add the correct height in Firefox.
2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655)
3. Ensure horizontal rules are visible by default.
*/
hr {
height: 0; /* 1 */
color: inherit; /* 2 */
border-top-width: 1px; /* 3 */
}
/*
Add the correct text decoration in Chrome, Edge, and Safari.
*/
abbr:where([title]) {
text-decoration: underline dotted;
}
/*
Remove the default font size and weight for headings.
*/
h1,
h2,
h3,
h4,
h5,
h6 {
font-size: inherit;
font-weight: inherit;
}
/*
Reset links to optimize for opt-in styling instead of opt-out.
*/
a {
color: inherit;
text-decoration: inherit;
}
/*
Add the correct font weight in Edge and Safari.
*/
b,
strong {
font-weight: bolder;
}
/*
1. Use the user's configured `mono` font family by default.
2. Correct the odd `em` font sizing in all browsers.
*/
code,
kbd,
samp,
pre {
font-family: theme("fontFamily.mono"); /* 1 */
font-size: 1em; /* 2 */
}
/*
Add the correct font size in all browsers.
*/
small {
font-size: 80%;
}
/*
Prevent `sub` and `sup` elements from affecting the line height in all browsers.
*/
sub,
sup {
font-size: 75%;
line-height: 0;
position: relative;
vertical-align: baseline;
}
sub {
bottom: -0.25em;
}
sup {
top: -0.5em;
}
/*
1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297)
2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016)
3. Remove gaps between table borders by default.
*/
table {
text-indent: 0; /* 1 */
border-color: inherit; /* 2 */
border-collapse: collapse; /* 3 */
}
/*
1. Change the font styles in all browsers.
2. Remove the margin in Firefox and Safari.
3. Remove default padding in all browsers.
*/
button,
input,
optgroup,
select,
textarea {
font-family: inherit; /* 1 */
font-size: 100%; /* 1 */
font-weight: inherit; /* 1 */
line-height: inherit; /* 1 */
color: inherit; /* 1 */
margin: 0; /* 2 */
padding: 0; /* 3 */
}
/*
Remove the inheritance of text transform in Edge and Firefox.
*/
button,
select {
text-transform: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Remove default button styles.
*/
button,
[type="button"],
[type="reset"],
[type="submit"] {
-webkit-appearance: button; /* 1 */
background-image: none; /* 2 */
}
/*
Use the modern Firefox focus style for all focusable elements.
*/
:-moz-focusring {
outline: auto;
}
/*
Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737)
*/
:-moz-ui-invalid {
box-shadow: none;
}
/*
Add the correct vertical alignment in Chrome and Firefox.
*/
progress {
vertical-align: baseline;
}
/*
Correct the cursor style of increment and decrement buttons in Safari.
*/
::-webkit-inner-spin-button,
::-webkit-outer-spin-button {
height: auto;
}
/*
1. Correct the odd appearance in Chrome and Safari.
2. Correct the outline style in Safari.
*/
[type="search"] {
-webkit-appearance: textfield; /* 1 */
outline-offset: -2px; /* 2 */
}
/*
Remove the inner padding in Chrome and Safari on macOS.
*/
::-webkit-search-decoration {
-webkit-appearance: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Change font properties to `inherit` in Safari.
*/
::-webkit-file-upload-button {
-webkit-appearance: button; /* 1 */
font: inherit; /* 2 */
}
/*
Add the correct display in Chrome and Safari.
*/
summary {
display: list-item;
}
/*
Removes the default spacing and border for appropriate elements.
*/
blockquote,
dl,
dd,
h1,
h2,
h3,
h4,
h5,
h6,
hr,
figure,
p,
pre {
margin: 0;
}
fieldset {
margin: 0;
padding: 0;
}
legend {
padding: 0;
}
ol,
ul,
menu {
list-style: none;
margin: 0;
padding: 0;
}
/*
Prevent resizing textareas horizontally by default.
*/
textarea {
resize: vertical;
}
/*
1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300)
2. Set the default placeholder color to the user's configured gray 400 color.
*/
input::placeholder,
textarea::placeholder {
opacity: 1; /* 1 */
color: theme("colors.gray.400"); /* 2 */
}
/*
Set the default cursor for buttons.
*/
button,
[role="button"] {
cursor: pointer;
}
/*
Make sure disabled buttons don't get the pointer cursor.
*/
:disabled {
cursor: default;
}
/*
1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14)
2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210)
This can trigger a poorly considered lint error in some tools but is included by design.
*/
img,
svg,
video,
canvas,
audio,
iframe,
embed,
object {
display: block; /* 1 */
vertical-align: middle; /* 2 */
}
/*
Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. (https://github.com/mozdevs/cssremedy/issues/14)
*/
img,
video {
max-width: 100%;
height: auto;
}
/* Make elements with the HTML hidden attribute stay hidden by default */
[hidden] {
display: none;
}

@ -1 +0,0 @@
export const SERVER_ADDRESS = "http://localhost:8080";

@ -1,21 +0,0 @@
export interface FileLite {
expanded?: boolean;
name: string;
url?: string;
type?: string;
score?: number;
size?: number;
embedding?: number[]; // The file embedding -- or mean embedding if there are multiple embeddings for the file
chunks?: TextEmbedding[]; // The chunks of text and their embeddings
extractedText?: string; // The extracted text from the file
}
export interface FileChunk extends TextEmbedding {
filename: string;
score?: number;
}
export interface TextEmbedding {
text: string;
embedding: number[];
}

@ -1,28 +0,0 @@
const { fontFamily } = require("tailwindcss/defaultTheme");
/** @type {import('tailwindcss').Config} */
module.exports = {
content: [
"./app/**/*.{js,ts,jsx,tsx}",
"./src/**/*.{js,ts,jsx,tsx}",
"./pages/**/*.{js,ts,jsx,tsx}",
"./components/**/*.{js,ts,jsx,tsx}",
],
corePlugins: {
preflight: false,
},
theme: {
extend: {
keyframes: {
blink: {
"0%, 100%": { opacity: 1 },
"50%": { opacity: 0 },
},
},
},
},
plugins: [
require("@tailwindcss/line-clamp"),
require("@tailwindcss/typography"),
],
};

@ -1,24 +0,0 @@
{
"compilerOptions": {
"target": "es5",
"lib": ["dom", "dom.iterable", "esnext"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
"forceConsistentCasingInFileNames": true,
"noEmit": true,
"esModuleInterop": true,
"module": "esnext",
"moduleResolution": "node",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"incremental": true,
"baseUrl": ".",
"paths": {
"@/*": ["./src/*"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"],
"exclude": ["node_modules"]
}

@ -1,80 +0,0 @@
import logging
import openai
from flask import current_app, jsonify
from config import *
from utils import get_embedding
TOP_K = 10
def get_answer_from_files(question, session_id, pinecone_index):
logging.info(f"Getting answer for question: {question}")
search_query_embedding = get_embedding(question, EMBEDDINGS_MODEL)
try:
query_response = pinecone_index.query(
namespace=session_id,
top_k=TOP_K,
include_values=False,
include_metadata=True,
vector=search_query_embedding,
)
logging.info(
f"[get_answer_from_files] received query response from Pinecone: {query_response}")
files_string = ""
file_text_dict = current_app.config["file_text_dict"]
for i in range(len(query_response.matches)):
result = query_response.matches[i]
file_chunk_id = result.id
score = result.score
filename = result.metadata["filename"]
file_text = file_text_dict.get(file_chunk_id)
file_string = f"###\n\"{filename}\"\n{file_text}\n"
if score < COSINE_SIM_THRESHOLD and i > 0:
logging.info(
f"[get_answer_from_files] score {score} is below threshold {COSINE_SIM_THRESHOLD} and i is {i}, breaking")
break
files_string += file_string
# Note: this is not the proper way to use the ChatGPT conversational format, but it works for now
messages = [
{
"role": "system",
"content": f"Given a question, try to answer it using the content of the file extracts below, and if you cannot answer, or find " \
f"a relevant file, just output \"I couldn't find the answer to that question in your files.\".\n\n" \
f"If the answer is not contained in the files or if there are no file extracts, respond with \"I couldn't find the answer " \
f"to that question in your files.\" If the question is not actually a question, respond with \"That's not a valid question.\"\n\n" \
f"In the cases where you can find the answer, first give the answer. Then explain how you found the answer from the source or sources, " \
f"and use the exact filenames of the source files you mention. Do not make up the names of any other files other than those mentioned "\
f"in the files context. Give the answer in markdown format." \
f"Use the following format:\n\nQuestion: <question>\n\nFiles:\n<###\n\"filename 1\"\nfile text>\n<###\n\"filename 2\"\nfile text>...\n\n"\
f"Answer: <answer or \"I couldn't find the answer to that question in your files\" or \"That's not a valid question.\">\n\n" \
f"Question: {question}\n\n" \
f"Files:\n{files_string}\n" \
f"Answer:"
},
]
response = openai.ChatCompletion.create(
messages=messages,
model=GENERATIVE_MODEL,
max_tokens=1000,
temperature=0,
)
choices = response["choices"] # type: ignore
answer = choices[0].message.content.strip()
logging.info(f"[get_answer_from_files] answer: {answer}")
return jsonify({"answer": answer})
except Exception as e:
logging.info(f"[get_answer_from_files] error: {e}")
return str(e)

@ -1,100 +0,0 @@
from __future__ import print_function
from config import *
import tiktoken
import pinecone
import uuid
import sys
import logging
from flask import Flask, jsonify
from flask_cors import CORS, cross_origin
from flask import request
from handle_file import handle_file
from answer_question import get_answer_from_files
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler("debug.log"),
logging.StreamHandler(sys.stdout)
]
)
def load_pinecone_index() -> pinecone.Index:
"""
Load index from Pinecone, raise error if the index can't be found.
"""
pinecone.init(
api_key=PINECONE_API_KEY,
environment=PINECONE_ENV,
)
index_name = PINECONE_INDEX
if index_name not in pinecone.list_indexes():
print(pinecone.list_indexes())
raise KeyError(f"Index '{index_name}' does not exist.")
index = pinecone.Index(index_name)
return index
def create_app():
pinecone_index = load_pinecone_index()
tokenizer = tiktoken.get_encoding("gpt2")
session_id = str(uuid.uuid4().hex)
app = Flask(__name__)
app.pinecone_index = pinecone_index
app.tokenizer = tokenizer
app.session_id = session_id
# log session id
logging.info(f"session_id: {session_id}")
app.config["file_text_dict"] = {}
CORS(app, supports_credentials=True)
return app
app = create_app()
@app.route(f"/process_file", methods=["POST"])
@cross_origin(supports_credentials=True)
def process_file():
try:
file = request.files['file']
logging.info(str(file))
handle_file(
file, app.session_id, app.pinecone_index, app.tokenizer)
return jsonify({"success": True})
except Exception as e:
logging.error(str(e))
return jsonify({"success": False})
@app.route(f"/answer_question", methods=["POST"])
@cross_origin(supports_credentials=True)
def answer_question():
try:
params = request.get_json()
question = params["question"]
answer_question_response = get_answer_from_files(
question, app.session_id, app.pinecone_index)
return answer_question_response
except Exception as e:
return str(e)
@app.route("/healthcheck", methods=["GET"])
@cross_origin(supports_credentials=True)
def healthcheck():
return "OK"
if __name__ == "__main__":
app.run(debug=True, port=SERVER_PORT, threaded=True)

@ -1,35 +0,0 @@
from pathlib import Path
import logging
import sys
from pprint import pformat
import yaml
# Load config items from config.yaml.
# Use Path.resolve() to get the absolute path of the parent directory
yaml_dir = Path(__file__).resolve().parent
yaml_path = yaml_dir / "config.yaml" # Use Path / operator to join paths
def load_yaml_config(path):
"""Load a yaml file and return a dictionary of its contents."""
try:
with open(path, "r") as stream:
return yaml.safe_load(stream)
except yaml.YAMLError as exc:
logging.error(f"Failed to load {path}: {exc}")
return None
# Load the config and update the global variables
yaml_config = load_yaml_config(yaml_path)
if yaml_config is not None:
logging.info(f"Loaded config from {yaml_path}:")
logging.info(pformat(yaml_config))
globals().update(yaml_config)
else:
logging.error(f"Could not load config from {yaml_path}.")
sys.exit(1) # Exit the program if the config is invalid
# Fall back to the default port "8080" if SERVER_PORT is not specified in the config
SERVER_PORT = yaml_config.get("SERVER_PORT", "8080")
# Use Path.resolve() to get the absolute path of the current directory
SERVER_DIR = Path(__file__).resolve().parent

@ -1,18 +0,0 @@
# ----- PINECONE CONFIG -----
PINECONE_API_KEY: "<your Pinecone API key>"
PINECONE_INDEX: "<your Pinecone Index name>" # dimensions: 1536, metric: cosine similarity
PINECONE_ENV: "<your Pinecone env e.g.us-west1-gcp>"
# ----- SERVER PORT ----
SERVER_PORT: "8080"
# ---- OPENAI CONFIG -----
EMBEDDINGS_MODEL: "text-embedding-ada-002"
GENERATIVE_MODEL: "gpt-3.5-turbo" # use gpt-4 for better results
EMBEDDING_DIMENSIONS: 1536
TEXT_EMBEDDING_CHUNK_SIZE: 200
# This is the minimum cosine similarity score that a file must have with the search query to be considered relevant
# This is an arbitrary value, and you should vary or remove it depending on the diversity of your dataset
COSINE_SIM_THRESHOLD: 0.7
MAX_TEXTS_TO_EMBED_BATCH_SIZE: 100
MAX_PINECONE_VECTORS_TO_UPSERT_PATCH_SIZE: 100

@ -1,168 +0,0 @@
import logging
import sys
import docx2txt
from PyPDF2 import PdfReader
from numpy import array, average
from flask import current_app
from config import *
from utils import get_embeddings, get_pinecone_id_for_file_chunk
# Set up logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler("debug.log"),
logging.StreamHandler(sys.stdout)
]
)
# Handle a file by extracting its text, creating embeddings, and upserting them to Pinecone
def handle_file(file, session_id, pinecone_index, tokenizer):
"""Handle a file by extracting its text, creating embeddings, and upserting them to Pinecone."""
filename = file.filename
logging.info("[handle_file] Handling file: {}".format(filename))
# Get the file text dict from the current app config
file_text_dict = current_app.config["file_text_dict"]
# Extract text from the file
try:
extracted_text = extract_text_from_file(file)
except ValueError as e:
logging.error(
"[handle_file] Error extracting text from file: {}".format(e))
raise e
# Save extracted text to file text dict
file_text_dict[filename] = extracted_text
# Handle the extracted text as a string
return handle_file_string(filename, session_id, extracted_text, pinecone_index, tokenizer, file_text_dict)
# Extract text from a file based on its mimetype
def extract_text_from_file(file):
"""Return the text content of a file."""
if file.mimetype == "application/pdf":
# Extract text from pdf using PyPDF2
reader = PdfReader(file)
extracted_text = ""
for page in reader.pages:
extracted_text += page.extract_text()
elif file.mimetype == "text/plain":
# Read text from plain text file
extracted_text = file.read().decode("utf-8")
file.close()
elif file.mimetype == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
# Extract text from docx using docx2txt
extracted_text = docx2txt.process(file)
else:
# Unsupported file type
raise ValueError("Unsupported file type: {}".format(file.mimetype))
return extracted_text
# Handle a file string by creating embeddings and upserting them to Pinecone
def handle_file_string(filename, session_id, file_body_string, pinecone_index, tokenizer, file_text_dict):
"""Handle a file string by creating embeddings and upserting them to Pinecone."""
logging.info("[handle_file_string] Starting...")
# Clean up the file string by replacing newlines and collapsing double spaces
clean_file_body_string = file_body_string.replace(
"\n", "; ").replace("  ", " ")
# Add the filename to the text to embed
text_to_embed = "Filename is: {}; {}".format(
filename, clean_file_body_string)
# Create embeddings for the text
try:
text_embeddings, average_embedding = create_embeddings_for_text(
text_to_embed, tokenizer)
logging.info(
"[handle_file_string] Created embedding for {}".format(filename))
except Exception as e:
logging.error(
"[handle_file_string] Error creating embedding: {}".format(e))
raise e
# Get the vectors array of triples: file_chunk_id, embedding, metadata for each embedding
# Metadata is a dict with keys: filename, file_chunk_index
vectors = []
for i, (text_chunk, embedding) in enumerate(text_embeddings):
id = get_pinecone_id_for_file_chunk(session_id, filename, i)
file_text_dict[id] = text_chunk
vectors.append(
(id, embedding, {"filename": filename, "file_chunk_index": i}))
logging.info(
"[handle_file_string] Text chunk {}: {}".format(i, text_chunk))
# Split the vectors array into batches of at most MAX_PINECONE_VECTORS_TO_UPSERT_PATCH_SIZE vectors
batch_size = MAX_PINECONE_VECTORS_TO_UPSERT_PATCH_SIZE
batches = [vectors[i:i+batch_size] for i in range(0, len(vectors), batch_size)]
# Upsert each batch to Pinecone
for batch in batches:
try:
pinecone_index.upsert(
vectors=batch, namespace=session_id)
logging.info(
"[handle_file_string] Upserted batch of embeddings for {}".format(filename))
except Exception as e:
logging.error(
"[handle_file_string] Error upserting batch of embeddings to Pinecone: {}".format(e))
raise e
# Compute the column-wise average of a list of lists
def get_col_average_from_list_of_lists(list_of_lists):
"""Return the average of each column in a list of lists."""
if len(list_of_lists) == 1:
return list_of_lists[0]
else:
list_of_lists_array = array(list_of_lists)
average_embedding = average(list_of_lists_array, axis=0)
return average_embedding.tolist()
# Create embeddings for a text using a tokenizer and an OpenAI engine
def create_embeddings_for_text(text, tokenizer):
"""Return a list of tuples (text_chunk, embedding) and an average embedding for a text."""
token_chunks = list(chunks(text, TEXT_EMBEDDING_CHUNK_SIZE, tokenizer))
text_chunks = [tokenizer.decode(chunk) for chunk in token_chunks]
# Split text_chunks into batches of at most MAX_TEXTS_TO_EMBED_BATCH_SIZE chunks
text_chunks_arrays = [text_chunks[i:i+MAX_TEXTS_TO_EMBED_BATCH_SIZE] for i in range(0, len(text_chunks), MAX_TEXTS_TO_EMBED_BATCH_SIZE)]
# Call get_embeddings for each shorter array and combine the results
embeddings = []
for text_chunks_array in text_chunks_arrays:
embeddings_response = get_embeddings(text_chunks_array, EMBEDDINGS_MODEL)
embeddings.extend([embedding["embedding"] for embedding in embeddings_response])
text_embeddings = list(zip(text_chunks, embeddings))
average_embedding = get_col_average_from_list_of_lists(embeddings)
return (text_embeddings, average_embedding)
# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
def chunks(text, n, tokenizer):
"""Yield successive n-sized chunks from text."""
tokens = tokenizer.encode(text)
i = 0
while i < len(tokens):
# Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
j = min(i + int(1.5 * n), len(tokens))
while j > i + int(0.5 * n):
# Decode the tokens and check for full stop or newline
chunk = tokenizer.decode(tokens[i:j])
if chunk.endswith(".") or chunk.endswith("\n"):
break
j -= 1
# If no end of sentence found, use n tokens as the chunk size
if j == i + int(0.5 * n):
j = min(i + n, len(tokens))
yield tokens[i:j]
i = j

@ -1,11 +0,0 @@
Flask-Cors>=3.0.10
openai>=0.27.2
pinecone-client>=2.0.13
PyPDF2>=2.10.4
numpy>=1.23.2
scikit-learn>=1.1.2
docx2txt>=0.8
flask>=1.1.4
jinja2>=3.0.1
PyYAML>=6.0
tiktoken>=0.1.2

@ -1,10 +0,0 @@
#!/bin/bash
set -e
echo "Starting Python server..."
pip install virtualenv
python -m virtualenv venv
source venv/bin/activate
pip install -r requirements.txt
OPENAI_API_KEY=$1 python app.py

@ -1,38 +0,0 @@
import openai
import logging
import sys
import time
from config import *
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler("debug.log"),
logging.StreamHandler(sys.stdout)
]
)
def get_pinecone_id_for_file_chunk(session_id, filename, chunk_index):
return str(session_id+"-!"+filename+"-!"+str(chunk_index))
def get_embedding(text, engine):
return openai.Engine(id=engine).embeddings(input=[text])["data"][0]["embedding"]
def get_embeddings(text_array, engine):
# Parameters for exponential backoff
max_retries = 5 # Maximum number of retries
base_delay = 1 # Base delay in seconds
factor = 2 # Factor to multiply the delay by after each retry
while True:
try:
return openai.Engine(id=engine).embeddings(input=text_array)["data"]
except Exception as e:
if max_retries > 0:
logging.info(f"Request failed. Retrying in {base_delay} seconds.")
time.sleep(base_delay)
max_retries -= 1
base_delay *= factor
else:
raise e

@ -1,2 +0,0 @@
ARG VARIANT=16-bullseye
FROM mcr.microsoft.com/vscode/devcontainers/typescript-node:${VARIANT}

@ -1,5 +0,0 @@
{
"name": "openai-cookbook file-q-and-a",
"dockerFile": "Dockerfile",
"forwardPorts": [3000]
}

@ -1,4 +0,0 @@
# create a copy of this file named .env.local
# Your own API key for OpenAI
OPENAI_API_KEY='sk-......'

@ -1,3 +0,0 @@
{
"extends": "next/core-web-vitals"
}

@ -1,36 +0,0 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.js
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# local env files
.env*.local
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts

@ -1,41 +0,0 @@
# File Q&A
File Q&A is a [Next.js](https://nextjs.org/) app that lets you find answers in your files using OpenAI APIs. You can upload files and ask questions related to their content, and the app will use embeddings and GPT to generate answers from the most relevant files.
## Requirements
To run the app, you need an OpenAI API key. You can create a new API key [here](https://beta.openai.com/account/api-keys).
## Set Up
If you don't have Node.js and npm already, install them from [https://nodejs.org/en/download/](https://nodejs.org/en/download/).
Alternatively, you can run the project inside a [Dev Container](https://code.visualstudio.com/docs/devcontainers/containers). If you are using VS Code, open the Command Palette (Ctrl+Shift+P on Windows / Cmd+Shift+P on Mac) and run the command `Dev Containers: Open Folder in Container`. Then select the folder `apps/file-q-and-a/nextjs`, and VS Code will take care of booting a container with Node.js and npm ready to go. You may need to install the [Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension first.
In your terminal, navigate to the `nextjs` directory of this example app, and then install dependencies:
```
npm install
```
Copy the `.env.local.example` file to a new file named `.env.local` and fill in the OpenAI API key field.
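For example, from the `nextjs` directory (a minimal sketch assuming a POSIX shell; adjust for your own environment):
```
cp .env.local.example .env.local
# then edit .env.local and set OPENAI_API_KEY to your own key
```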
## Development
Run the development server:
```
npm run dev
```
Open [http://localhost:3000](http://localhost:3000) with your browser to see the app.
## Deployment
You can deploy the app on [Vercel](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme), the platform from the creators of Next.js. Check out the [Next.js deployment documentation](https://nextjs.org/docs/deployment) for more details.
## Limitations
Uploaded files and generated embeddings don't persist across browser refreshes. If you want to persist them, or store embeddings for a larger number of files, we recommend using a vector database (e.g. Pinecone, Weaviate, Milvus, Qdrant, Redis, FAISS, etc.). The `nextjs-with-flask-server` version of this demo uses a Pinecone vector database.
The app may sometimes generate answers that are not in the files, or hallucinate about the existence of files that are not uploaded.

@ -1,6 +0,0 @@
/** @type {import('next').NextConfig} */
const nextConfig = {
reactStrictMode: true,
}
module.exports = nextConfig

File diff suppressed because it is too large

@ -1,44 +0,0 @@
{
"name": "file-q-and-a",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"@headlessui/react": "^1.7.7",
"@heroicons/react": "^2.0.13",
"@next/font": "13.1.2",
"@tailwindcss/line-clamp": "^0.4.2",
"@tailwindcss/typography": "^0.5.9",
"@types/formidable": "^2.0.5",
"@types/lodash": "^4.14.191",
"@types/node": "18.11.18",
"@types/pdf-parse": "^1.1.1",
"@types/react": "18.0.27",
"@types/react-dom": "18.0.10",
"axios": "^1.2.3",
"clsx": "^1.2.1",
"eslint": "8.32.0",
"eslint-config-next": "13.1.2",
"formidable": "^2.1.1",
"lodash": "^4.17.21",
"mammoth": "^1.5.1",
"next": "13.1.2",
"node-html-markdown": "^1.3.0",
"openai": "^3.2.1",
"pdf-parse": "^1.1.1",
"react": "18.2.0",
"react-dom": "18.2.0",
"react-markdown": "^8.0.5",
"typescript": "4.9.4"
},
"devDependencies": {
"autoprefixer": "^10.4.13",
"postcss": "^8.4.31",
"tailwindcss": "^3.2.4"
}
}

@ -1,6 +0,0 @@
module.exports = {
plugins: {
tailwindcss: {},
autoprefixer: {},
},
};

Binary file not shown.


@ -1 +0,0 @@
<svg id="openai-horizontal" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 120 29.53"><path d="M40.7,6.98s-.05,0-.07,0c-.02,0-.05,0-.07,0-4.67,0-7.58,2.91-7.58,7.6v2.53c0,4.69,2.9,7.6,7.58,7.6,.02,0,.05,0,.07,0,.02,0,.05,0,.07,0,4.67,0,7.58-2.91,7.58-7.6v-2.53c0-4.69-2.91-7.6-7.58-7.6Zm4.31,10.31c0,3.08-1.6,4.86-4.38,4.89-2.78-.03-4.38-1.81-4.38-4.89v-2.88c0-3.08,1.6-4.86,4.38-4.89,2.78,.03,4.38,1.81,4.38,4.89v2.88Zm40.57-5.79s-.06,0-.09,0c-.02,0-.03,0-.05,0-1.77,0-3.03,.6-3.65,1.75l-.19,.35v-1.8h-3.02v12.56h3.17v-7.48c0-1.76,.95-2.77,2.59-2.8,1.57,.03,2.47,1.02,2.47,2.73v7.55h3.17v-8.09c0-2.99-1.64-4.77-4.39-4.77Zm34.42-1.77v-2.4h-10.46v2.4h3.67v12.22h-3.67v2.4h10.46v-2.4h-3.67V9.73h3.67Zm-18.75-2.4h0s-3.28,0-3.28,0l-6.1,17.04h3.43l1.17-3.65h6.66v.04s1.17,3.62,1.17,3.62h3.43l-6.11-17.04h-.36Zm-4.03,10.98l2.57-8.05,2.55,8.05h-5.12Zm-39.45-6.81s-.05,0-.07,0c-.03,0-.05,0-.07,0-1.59,0-2.96,.66-3.68,1.76l-.18,.28v-1.74h-3.02V28.69h3.17v-5.9l.18,.27c.68,1.01,2.01,1.61,3.56,1.61,.03,0,.05,0,.08,0,.02,0,.04,0,.07,0,2.61,0,5.24-1.7,5.24-5.51v-2.14c0-2.74-1.62-5.51-5.26-5.51Zm2.1,7.5c0,2-1.15,3.24-3.01,3.28-1.73-.03-2.94-1.35-2.94-3.23v-1.89c0-1.9,1.22-3.24,2.97-3.28,1.84,.03,2.98,1.28,2.98,3.28v1.84Zm11.05-7.5h0c-.06,0-.12,.01-.18,.01-.06,0-.12-.01-.18-.01h0c-3.57,0-5.78,2.23-5.78,5.81v1.76c0,3.45,2.24,5.59,5.83,5.59,.08,0,.15,0,.22-.01,.05,0,.09,.01,.14,.01,2.41,0,4.09-.88,5.16-2.7l-2.13-1.23c-.71,1.05-1.66,1.84-3.02,1.84-1.82,0-2.91-1.12-2.91-3.01v-.5h8.44v-2.08c0-3.34-2.19-5.49-5.59-5.49Zm-2.86,5.54v-.3c0-2,.95-3.12,2.68-3.2,1.66,.08,2.66,1.18,2.66,2.99v.5s-5.34,0-5.34,0Z"></path><path d="M27.21,12.08c.67-2.01,.44-4.21-.63-6.04-1.61-2.8-4.85-4.24-8.01-3.57C17.16,.89,15.14-.01,13.02,0c-3.23,0-6.1,2.08-7.1,5.15-2.08,.43-3.87,1.73-4.92,3.57-1.62,2.8-1.25,6.32,.92,8.72-.67,2.01-.44,4.21,.63,6.03,1.61,2.81,4.85,4.25,8.02,3.58,1.4,1.58,3.42,2.49,5.54,2.48,3.23,0,6.1-2.08,7.1-5.15,2.08-.43,3.87-1.73,4.91-3.57,1.63-2.8,1.26-6.32-.91-8.72Zm-2.3-5.07c.64,1.12,.88,2.43,.66,3.7-.04-.03-.12-.07-.17-.1l-5.88-3.4c-.3-.17-.67-.17-.97,0l-6.89,3.98v-2.92l5.69-3.29c2.65-1.53,6.03-.62,7.56,2.03Zm-13.25,6.07l2.9-1.68,2.9,1.68v3.35l-2.9,1.68-2.9-1.68v-3.35ZM13.01,1.93c1.3,0,2.55,.45,3.55,1.28-.04,.02-.12,.07-.18,.1l-5.88,3.39c-.3,.17-.48,.49-.48,.84v7.96l-2.53-1.46V7.46c0-3.06,2.47-5.53,5.53-5.54ZM2.68,9.69h0c.65-1.12,1.66-1.98,2.88-2.43v6.99c0,.35,.18,.66,.48,.84l6.88,3.97-2.54,1.47-5.68-3.28c-2.64-1.53-3.55-4.91-2.02-7.56Zm1.55,12.83h0c-.65-1.11-.88-2.43-.66-3.7,.04,.03,.12,.07,.17,.1l5.88,3.4c.3,.17,.67,.17,.97,0l6.88-3.98v2.92l-5.69,3.28c-2.65,1.52-6.03,.62-7.56-2.02Zm11.89,5.08c-1.29,0-2.55-.45-3.54-1.28,.04-.02,.13-.07,.18-.1l5.88-3.39c.3-.17,.49-.49,.48-.84v-7.95l2.53,1.46v6.57c0,3.06-2.48,5.54-5.53,5.54Zm10.34-7.76c-.65,1.12-1.67,1.98-2.88,2.42v-6.99c0-.35-.18-.67-.48-.84h0l-6.89-3.98,2.53-1.46,5.69,3.28c2.65,1.53,3.55,4.91,2.02,7.56Z"></path></svg>


@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" id="openai-symbol" viewBox="0 0 32 32"><path d="M29.71,13.09A8.09,8.09,0,0,0,20.34,2.68a8.08,8.08,0,0,0-13.7,2.9A8.08,8.08,0,0,0,2.3,18.9,8,8,0,0,0,3,25.45a8.08,8.08,0,0,0,8.69,3.87,8,8,0,0,0,6,2.68,8.09,8.09,0,0,0,7.7-5.61,8,8,0,0,0,5.33-3.86A8.09,8.09,0,0,0,29.71,13.09Zm-12,16.82a6,6,0,0,1-3.84-1.39l.19-.11,6.37-3.68a1,1,0,0,0,.53-.91v-9l2.69,1.56a.08.08,0,0,1,.05.07v7.44A6,6,0,0,1,17.68,29.91ZM4.8,24.41a6,6,0,0,1-.71-4l.19.11,6.37,3.68a1,1,0,0,0,1,0l7.79-4.49V22.8a.09.09,0,0,1,0,.08L13,26.6A6,6,0,0,1,4.8,24.41ZM3.12,10.53A6,6,0,0,1,6.28,7.9v7.57a1,1,0,0,0,.51.9l7.75,4.47L11.85,22.4a.14.14,0,0,1-.09,0L5.32,18.68a6,6,0,0,1-2.2-8.18Zm22.13,5.14-7.78-4.52L20.16,9.6a.08.08,0,0,1,.09,0l6.44,3.72a6,6,0,0,1-.9,10.81V16.56A1.06,1.06,0,0,0,25.25,15.67Zm2.68-4-.19-.12-6.36-3.7a1,1,0,0,0-1.05,0l-7.78,4.49V9.2a.09.09,0,0,1,0-.09L19,5.4a6,6,0,0,1,8.91,6.21ZM11.08,17.15,8.38,15.6a.14.14,0,0,1-.05-.08V8.1a6,6,0,0,1,9.84-4.61L18,3.6,11.61,7.28a1,1,0,0,0-.53.91ZM12.54,14,16,12l3.47,2v4L16,20l-3.47-2Z"/></svg>


@ -1,77 +0,0 @@
import { useState, useCallback, memo } from "react";
import { Transition } from "@headlessui/react";
import {
MagnifyingGlassMinusIcon,
MagnifyingGlassPlusIcon,
ArrowTopRightOnSquareIcon,
} from "@heroicons/react/24/outline";
import { FileLite } from "../types/file";
type FileProps = {
file: FileLite;
showScore?: boolean;
};
function File(props: FileProps) {
const [expanded, setExpanded] = useState(false);
const handleExpand = useCallback(() => {
setExpanded((prev) => !prev);
}, []);
return (
<div
className="border-gray-100 border rounded-md shadow p-2 cursor-pointer"
onClick={handleExpand}
>
<div className="flex flex-row justify-between">
<div className="flex hover:text-gray-600">{props.file.name}</div>
<div className="flex flex-row space-x-2">
{props.showScore && props.file.score && (
<div className="flex text-blue-600 mr-4">
{props.file.score.toFixed(2)}
</div>
)}
<div className="ml-auto w-max flex items-center justify-center">
{expanded ? (
<MagnifyingGlassMinusIcon className="text-gray-500 h-5" />
) : (
<MagnifyingGlassPlusIcon className="text-gray-500 h-5" />
)}
</div>
<a
href={props.file.url}
target="_blank"
rel="noopener noreferrer"
onClick={(e) => e.stopPropagation()} // prevent the click event from bubbling up to the list item
>
<ArrowTopRightOnSquareIcon className="text-gray-500 h-5" />
</a>
</div>
</div>
<Transition
show={expanded}
enter="transition duration-75 ease-out"
enterFrom="transform translate-y-4 opacity-0"
enterTo="transform translate-y-0 opacity-100"
leave="transition duration-100 ease-out"
leaveFrom="transform translate-y-0 opacity-100"
leaveTo="transform translate-y-4 opacity-0"
>
<div className="items-center mt-2 justify-center">
<iframe
src={props.file.url}
className="h-full w-full"
title={props.file.name}
></iframe>
</div>
</Transition>
</div>
);
}
export default memo(File);

@ -1,179 +0,0 @@
import React, { memo, useCallback, useRef, useState } from "react";
import { Transition } from "@headlessui/react";
import axios from "axios";
import ReactMarkdown from "react-markdown";
import FileViewerList from "./FileViewerList";
import LoadingText from "./LoadingText";
import { isFileNameInString } from "../services/utils";
import { FileChunk, FileLite } from "../types/file";
type FileQandAAreaProps = {
files: FileLite[];
};
function FileQandAArea(props: FileQandAAreaProps) {
const questionRef = useRef(null);
const [hasAskedQuestion, setHasAskedQuestion] = useState(false);
const [answerError, setAnswerError] = useState("");
const [answerLoading, setAnswerLoading] = useState<boolean>(false);
const [answer, setAnswer] = useState("");
const [answerDone, setAnswerDone] = useState(false);
const handleSearch = useCallback(async () => {
if (answerLoading) {
return;
}
const question = (questionRef?.current as any)?.value ?? "";
setAnswer("");
setAnswerDone(false);
if (!question) {
setAnswerError("Please ask a question.");
return;
}
if (props.files.length === 0) {
setAnswerError("Please upload files before asking a question.");
return;
}
setAnswerLoading(true);
setAnswerError("");
let results: FileChunk[] = [];
try {
const searchResultsResponse = await axios.post(
"/api/search-file-chunks",
{
searchQuery: question,
files: props.files,
maxResults: 10,
}
);
if (searchResultsResponse.status === 200) {
results = searchResultsResponse.data.searchResults;
} else {
setAnswerError("Sorry, something went wrong!");
}
} catch (err: any) {
setAnswerError("Sorry, something went wrong!");
}
setHasAskedQuestion(true);
const res = await fetch("/api/get-answer-from-files", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
question,
fileChunks: results,
}),
});
if (res.status === 500) {
setAnswerError("Internal server error. Please try again later.");
setAnswerLoading(false);
return;
}
const reader = res.body!.getReader();
while (true) {
const { done, value } = await reader.read();
if (done) {
setAnswerDone(true);
break;
}
setAnswer((prev) => prev + new TextDecoder().decode(value));
}
setAnswerLoading(false);
}, [props.files, answerLoading]);
const handleEnterInSearchBar = useCallback(
async (event: React.SyntheticEvent) => {
if ((event as any).key === "Enter") {
await handleSearch();
}
},
[handleSearch]
);
return (
<div className="space-y-4 text-gray-800">
<div className="mt-2">
Ask a question based on the content of your files:
</div>
<div className="space-y-2">
<input
className="border rounded border-gray-200 w-full py-1 px-2"
placeholder="e.g. What were the key takeaways from the Q1 planning meeting?"
name="search"
ref={questionRef}
onKeyDown={handleEnterInSearchBar}
/>
<div
className="rounded-md bg-gray-50 py-1 px-4 w-max text-gray-500 hover:bg-gray-100 border border-gray-100 shadow cursor-pointer"
onClick={handleSearch}
>
{answerLoading ? (
<LoadingText text="Answering question..." />
) : (
"Ask question"
)}
</div>
</div>
<div className="">
{answerError && <div className="text-red-500">{answerError}</div>}
<Transition
show={hasAskedQuestion}
enter="transition duration-600 ease-out"
enterFrom="transform opacity-0"
enterTo="transform opacity-100"
leave="transition duration-125 ease-out"
leaveFrom="transform opacity-100"
leaveTo="transform opacity-0"
className="mb-8"
>
{answer && (
<div className="">
<ReactMarkdown className="prose" linkTarget="_blank">
{`${answer}${answerDone ? "" : " |"}`}
</ReactMarkdown>
</div>
)}
<Transition
show={
props.files.filter((file) =>
isFileNameInString(file.name, answer)
).length > 0
}
enter="transition duration-600 ease-out"
enterFrom="transform opacity-0"
enterTo="transform opacity-100"
leave="transition duration-125 ease-out"
leaveFrom="transform opacity-100"
leaveTo="transform opacity-0"
className="mb-8"
>
<FileViewerList
files={props.files.filter((file) =>
isFileNameInString(file.name, answer)
)}
title="Sources"
listExpanded={true}
/>
</Transition>
</Transition>
</div>
</div>
);
}
export default memo(FileQandAArea);

@ -1,201 +0,0 @@
import React, {
Dispatch,
SetStateAction,
useCallback,
useState,
memo,
useRef,
} from "react";
import axios from "axios";
import { ArrowUpTrayIcon } from "@heroicons/react/24/outline";
import { compact } from "lodash";
import LoadingText from "./LoadingText";
import { FileLite } from "../types/file";
import FileViewerList from "./FileViewerList";
type FileUploadAreaProps = {
handleSetFiles: Dispatch<SetStateAction<FileLite[]>>;
maxNumFiles: number;
maxFileSizeMB: number;
};
function FileUploadArea(props: FileUploadAreaProps) {
const handleSetFiles = props.handleSetFiles;
const [files, setFiles] = useState<FileLite[]>([]);
const [loading, setLoading] = useState(false);
const [error, setError] = useState("");
const [dragOver, setDragOver] = useState(false);
const dropzoneRef = useRef<HTMLLabelElement>(null);
const handleFileChange = useCallback(
async (selectedFiles: FileList | null) => {
if (selectedFiles && selectedFiles.length > 0) {
setError("");
if (files.length + selectedFiles.length > props.maxNumFiles) {
setError(`You can only upload up to ${props.maxNumFiles} files.`);
if (dropzoneRef.current) {
(dropzoneRef.current as any).value = "";
}
return;
}
setLoading(true);
const uploadedFiles = await Promise.all(
Array.from(selectedFiles).map(async (file) => {
// Check the file type
if (
file.type.match(
/(text\/plain|application\/(pdf|msword|vnd\.openxmlformats-officedocument\.wordprocessingml\.document)|text\/(markdown|x-markdown))/
) && // AND file isn't too big
file.size < props.maxFileSizeMB * 1024 * 1024
) {
// Check if the file name already exists in the files state
if (files.find((f) => f.name === file.name)) {
return null; // Skip this file
}
const formData = new FormData();
formData.append("file", file);
formData.append("filename", file.name);
try {
const processFileResponse = await axios.post(
"/api/process-file",
formData,
{
headers: {
"Content-Type": "multipart/form-data",
},
}
);
if (processFileResponse.status === 200) {
const text = processFileResponse.data.text;
const meanEmbedding = processFileResponse.data.meanEmbedding;
const chunks = processFileResponse.data.chunks;
const fileObject: FileLite = {
name: file.name,
url: URL.createObjectURL(file),
type: file.type,
size: file.size,
expanded: false,
embedding: meanEmbedding,
chunks,
extractedText: text,
};
console.log(fileObject);
return fileObject;
} else {
console.log("Error creating file embedding");
return null;
}
} catch (err: any) {
console.log(`Error creating file embedding: ${err}`);
return null;
}
} else {
alert(
`Invalid file type or size. Only TXT, PDF, DOCX or MD are allowed, up to ${props.maxFileSizeMB}MB.`
);
return null; // Skip this file
}
})
);
// Filter out any null values from the uploadedFiles array
const validFiles = compact(uploadedFiles);
// Set the files state with the valid files and the existing files
setFiles((prevFiles) => [...prevFiles, ...validFiles]);
handleSetFiles((prevFiles) => [...prevFiles, ...validFiles]);
setLoading(false);
}
},
[files, handleSetFiles, props.maxFileSizeMB, props.maxNumFiles]
);
const handleDragEnter = useCallback((event: React.DragEvent) => {
event.preventDefault();
setDragOver(true);
}, []);
const handleDragOver = useCallback((event: React.DragEvent) => {
event.preventDefault();
}, []);
const handleDragLeave = useCallback((event: React.DragEvent) => {
event.preventDefault();
setDragOver(false);
}, []);
const handleDrop = useCallback(
(event: React.DragEvent) => {
event.preventDefault();
setDragOver(false);
const droppedFiles = event.dataTransfer.files;
handleFileChange(droppedFiles);
},
[handleFileChange]
);
return (
<div className="flex items-center justify-center w-full flex-col">
<label
htmlFor="dropzone-file"
className={`flex flex-col shadow items-center justify-center w-full h-36 border-2 border-gray-300 border-dashed rounded-lg cursor-pointer bg-gray-50 hover:bg-gray-100 relative ${
dragOver ? "border-blue-500 bg-blue-50" : ""
}`}
ref={dropzoneRef}
onDragEnter={handleDragEnter}
onDragOver={handleDragOver}
onDragLeave={handleDragLeave}
onDrop={handleDrop}
>
<div className="flex flex-col items-center justify-center pt-5 pb-6">
{loading ? (
<LoadingText text="Uploading..." />
) : (
<div className="text-gray-500 flex flex-col items-center text-center">
<ArrowUpTrayIcon className="w-7 h-7 mb-4" />
<p className="mb-2 text-sm">
<span className="font-semibold">Click to upload</span> or drag
and drop
</p>
<p className="text-xs">
TXT, PDF, DOCX or MD (max {props.maxFileSizeMB}MB per file)
</p>
<p className="text-xs mt-1">
You can upload up to {props.maxNumFiles - files.length} more{" "}
{props.maxNumFiles - files.length === 1 ? "file" : "files"}
</p>
<input
id="dropzone-file"
type="file"
className="hidden"
multiple
onChange={(event) => handleFileChange(event.target.files)}
/>
</div>
)}
</div>
</label>
{error && (
<div className="flex items-center justify-center w-full mt-4">
<p className="text-sm text-red-500">{error}</p>
</div>
)}
<FileViewerList files={files} title="Uploaded Files" />
</div>
);
}
export default memo(FileUploadArea);

@ -1,73 +0,0 @@
import React, { memo, useCallback, useState } from "react";
import { ChevronUpIcon } from "@heroicons/react/24/outline";
import clsx from "clsx";
import { Transition } from "@headlessui/react";
import File from "./File";
import { FileLite } from "../types/file";
type FileViewerListProps = {
files: FileLite[];
title: string;
listExpanded?: boolean;
showScores?: boolean;
};
function FileViewerList(props: FileViewerListProps) {
const [listExpanded, setListExpanded] = useState(props.listExpanded ?? false);
const handleListExpand = useCallback(() => {
setListExpanded((prev) => !prev);
}, []);
return (
<div className="flex items-left justify-center w-full">
{props.files.length > 0 && (
<div className="flex flex-col items-left justify-center w-full mt-4">
<div className="flex flex-row">
<div
className="rounded-md flex shadow p-2 mb-2 w-full bg-gray-50 items-center cursor-pointer "
onClick={handleListExpand}
>
{props.title}
<div className="bg-gray-300 ml-2 px-2 rounded-full w-max text-center text-sm ">
{props.files.length}
</div>
</div>
<div className="ml-auto w-max flex items-center justify-center">
<ChevronUpIcon
className={clsx(
"w-6 h-6 ml-2 stroke-slate-400 transition-transform cursor-pointer",
!listExpanded && "-rotate-180"
)}
onClick={handleListExpand}
/>
</div>
</div>
<Transition
show={listExpanded}
enter="transition duration-125 ease-out"
enterFrom="transform translate-y-4 opacity-0"
enterTo="transform translate-y-0 opacity-100"
leave="transition duration-125 ease-out"
leaveFrom="transform translate-y-0 opacity-100"
leaveTo="transform translate-y-4 opacity-0"
>
<div className="text-sm text-gray-500 space-y-2">
{props.files.map((file) => (
<File
key={file.name}
file={file}
showScore={props.showScores}
/>
))}
</div>
</Transition>
</div>
)}
</div>
);
}
export default memo(FileViewerList);

@ -1,33 +0,0 @@
import clsx from "clsx";
type Props = {
className?: string;
size?: number;
};
export default function LoadingSpinner(props: Props) {
const size = props.size || 5;
return (
<div className={clsx("flex flex-row", props.className)}>
<svg
aria-hidden="true"
className={clsx(
"mr-2 text-gray-200 animate-spin dark:text-gray-600 fill-black stroke-1",
`w-${size} h-${size}`
)}
viewBox="0 0 100 101"
fill="none"
xmlns="http://www.w3.org/2000/svg"
>
<path
d="M100 50.5908C100 78.2051 77.6142 100.591 50 100.591C22.3858 100.591 0 78.2051 0 50.5908C0 22.9766 22.3858 0.59082 50 0.59082C77.6142 0.59082 100 22.9766 100 50.5908ZM9.08144 50.5908C9.08144 73.1895 27.4013 91.5094 50 91.5094C72.5987 91.5094 90.9186 73.1895 90.9186 50.5908C90.9186 27.9921 72.5987 9.67226 50 9.67226C27.4013 9.67226 9.08144 27.9921 9.08144 50.5908Z"
fill="currentColor"
/>
<path
d="M93.9676 39.0409C96.393 38.4038 97.8624 35.9116 97.0079 33.5539C95.2932 28.8227 92.871 24.3692 89.8167 20.348C85.8452 15.1192 80.8826 10.7238 75.2124 7.41289C69.5422 4.10194 63.2754 1.94025 56.7698 1.05124C51.7666 0.367541 46.6976 0.446843 41.7345 1.27873C39.2613 1.69328 37.813 4.19778 38.4501 6.62326C39.0873 9.04874 41.5694 10.4717 44.0505 10.1071C47.8511 9.54855 51.7191 9.52689 55.5402 10.0491C60.8642 10.7766 65.9928 12.5457 70.6331 15.2552C75.2735 17.9648 79.3347 21.5619 82.5849 25.841C84.9175 28.9121 86.7997 32.2913 88.1811 35.8758C89.083 38.2158 91.5421 39.6781 93.9676 39.0409Z"
fill="currentFill"
/>
</svg>
</div>
);
}

@ -1,18 +0,0 @@
import React, { memo } from "react";
import LoadingSpinner from "./LoadingSpinner";
type LoadingTextProps = {
text: string;
};
function LoadingText(props: LoadingTextProps) {
return (
<div className="text-gray-500 text-md flex flex-row justify-center items-center">
<LoadingSpinner />
{props.text && <div className="flex">{props.text}</div>}
</div>
);
}
export default memo(LoadingText);

@ -1,6 +0,0 @@
import "@/styles/globals.css";
import type { AppProps } from "next/app";
export default function App({ Component, pageProps }: AppProps) {
return <Component {...pageProps} />;
}

@ -1,13 +0,0 @@
import { Html, Head, Main, NextScript } from "next/document";
export default function Document() {
return (
<Html lang="en">
<Head />
<body>
<Main />
<NextScript />
</body>
</Html>
);
}

@ -1,74 +0,0 @@
import type { NextApiRequest, NextApiResponse } from "next";
import { completionStream } from "../../services/openai";
import { FileChunk } from "../../types/file";
type Data = {
answer?: string;
error?: string;
};
const MAX_FILES_LENGTH = 2000 * 3;
export default async function handler(
req: NextApiRequest,
res: NextApiResponse<Data>
) {
// Only accept POST requests
if (req.method !== "POST") {
res.status(405).json({ error: "Method not allowed" });
return;
}
const fileChunks = req.body.fileChunks as FileChunk[];
const question = req.body.question as string;
if (!Array.isArray(fileChunks)) {
res.status(400).json({ error: "fileChunks must be an array" });
return;
}
if (!question) {
res.status(400).json({ error: "question must be a string" });
return;
}
try {
const filesString = fileChunks
.map((fileChunk) => `###\n\"${fileChunk.filename}\"\n${fileChunk.text}`)
.join("\n")
.slice(0, MAX_FILES_LENGTH);
const prompt =
`Given a question, try to answer it using the content of the file extracts below, and if you cannot answer, or find a relevant file, just output \"I couldn't find the answer to that question in your files.\".\n\n` +
`If the answer is not contained in the files or if there are no file extracts, respond with \"I couldn't find the answer to that question in your files.\" If the question is not actually a question, respond with \"That's not a valid question.\"\n\n` +
`In the cases where you can find the answer, first give the answer. Then explain how you found the answer from the source or sources, and use the exact filenames of the source files you mention. Do not make up the names of any other files other than those mentioned in the files context. Give the answer in markdown format. ` +
`Use the following format:\n\nQuestion: <question>\n\nFiles:\n<###\n\"filename 1\"\nfile text>\n<###\n\"filename 2\"\nfile text>...\n\nAnswer: <answer or "I couldn't find the answer to that question in your files" or "That's not a valid question.">\n\n` +
`Question: ${question}\n\n` +
`Files:\n${filesString}\n\n` +
`Answer:`;
const stream = completionStream({
prompt,
});
// Set the response headers for streaming
res.writeHead(200, {
"Content-Type": "text/event-stream",
"Cache-Control": "no-cache, no-transform",
Connection: "keep-alive",
});
// Write the data from the stream to the response
for await (const data of stream) {
res.write(data);
}
// End the response when the stream is done
res.end();
} catch (error) {
console.error(error);
res.status(500).json({ error: "Something went wrong" });
}
}

@ -1,69 +0,0 @@
import type { NextApiRequest, NextApiResponse } from "next";
import formidable, { Fields, Files } from "formidable"; // to handle file uploads
import { TextEmbedding } from "../../types/file";
import extractTextFromFile from "../../services/extractTextFromFile";
import { createEmbeddings } from "../../services/createEmbeddings";
// Disable the default body parser to handle file uploads
export const config = { api: { bodyParser: false } };
type Data = {
text?: string;
meanEmbedding?: number[];
chunks?: TextEmbedding[];
error?: string;
};
// This function receives a file as a multipart form and returns the text extracted from the file and the OpenAI embedding for that text
export default async function handler(
req: NextApiRequest,
res: NextApiResponse<Data>
) {
if (req.method !== "POST") {
res.status(405).json({ error: "Method not allowed" });
return;
}
// Create a formidable instance to parse the request as a multipart form
const options = {
maxFileSize: 30 * 1024 * 1024 // Set the max file size to 30MB
};
const form = formidable(options);
try {
const { fields, files } = await new Promise<{
fields: Fields;
files: Files;
}>((resolve, reject) => {
form.parse(req, (err, fields, files) => {
if (err) {
reject(err);
} else {
resolve({ fields, files } as { fields: Fields; files: Files });
}
});
});
const file = files.file;
if (!file || Array.isArray(file) || file.size === 0) {
res.status(400).json({ error: "Invalid or missing file" });
return;
}
const text = await extractTextFromFile({
filepath: file.filepath,
filetype: file.mimetype ?? "",
});
const { meanEmbedding, chunks } = await createEmbeddings({
text,
});
res.status(200).json({ text, meanEmbedding, chunks });
} catch (error: any) {
res.status(500).json({ error: error.message });
} finally {
// Always send a response, even if it is empty
res.end();
}
}

@ -1,59 +0,0 @@
import type { NextApiRequest, NextApiResponse } from "next";
import { searchFileChunks } from "../../services/searchFileChunks";
import { FileChunk, FileLite } from "../../types/file";
type Data = {
searchResults?: FileChunk[];
error?: string;
};
export const config = {
api: {
bodyParser: {
sizeLimit: "30mb",
},
},
};
export default async function handler(
req: NextApiRequest,
res: NextApiResponse<Data>
) {
try {
const searchQuery = req.body.searchQuery as string;
const files = req.body.files as FileLite[];
const maxResults = req.body.maxResults as number;
if (!searchQuery) {
res.status(400).json({ error: "searchQuery must be a string" });
return;
}
if (!Array.isArray(files) || files.length === 0) {
res.status(400).json({ error: "files must be a non-empty array" });
return;
}
if (!maxResults || maxResults < 1) {
res
.status(400)
.json({ error: "maxResults must be a number greater than 0" });
return;
}
const searchResults = await searchFileChunks({
searchQuery,
files,
maxResults,
});
res.status(200).json({ searchResults });
} catch (error) {
console.error(error);
res.status(500).json({ error: "Something went wrong" });
}
}

@ -1,35 +0,0 @@
import Head from "next/head";
import { useState } from "react";
import FileQandAArea from "../components/FileQandAArea";
import { FileLite } from "../types/file";
import FileUploadArea from "../components/FileUploadArea";
export default function FileQandA() {
const [files, setFiles] = useState<FileLite[]>([]);
return (
<div className="flex items-left text-left h-screen flex-col">
<Head>
<title>File Q&A</title>
</Head>
<div className="max-w-3xl mx-auto m-8 space-y-8 text-gray-800">
<h1 className="text-4xl">File Q&A</h1>
<div className="">
To search for answers from the content in your files, upload them here
and we will use OpenAI embeddings and GPT to find answers from the
relevant documents.
</div>
<FileUploadArea
handleSetFiles={setFiles}
maxNumFiles={75}
maxFileSizeMB={30}
/>
<FileQandAArea files={files} />
</div>
</div>
);
}

@ -1,74 +0,0 @@
// A function that splits a text into smaller pieces of roughly equal length
// The pieces are delimited by sentences and try to avoid breaking words or punctuation
// This can be useful for processing long texts with natural language models that have a limited input size
export function chunkText({
text, // The input text to be split
// The desired maximum length of each piece in characters
// This uses 4 characters as an approximation of the average token length
// since there isn't a good JS tokenizer at the moment
maxCharLength = 250 * 4,
}: {
text: string;
maxCharLength?: number;
}): string[] {
// Create an empty array to store the pieces
const chunks: string[] = [];
// Create a variable to hold the current piece
let currentChunk = "";
// Remove any newline characters from the text and split it by periods
// This assumes that periods mark the end of sentences, which may not be true for some languages
const sentences = text.replace(/\n/g, " ").split(/([.])/);
for (const sentence of sentences) {
// Remove any extra whitespace from the beginning and end of the sentence
const trimmedSentence = sentence.trim();
// If the sentence is empty, skip it
if (!trimmedSentence) continue;
// Check if adding the sentence to the current piece would make it too long, too short, or just right
// This uses a tolerance range of 50% of the maximum length to allow some flexibility
// If the piece is too long, save it and start a new one
// If the piece is too short, add the sentence and continue
// If the piece is just right, save it and start a new one
const chunkLength = currentChunk.length + trimmedSentence.length + 1;
const lowerBound = maxCharLength - maxCharLength * 0.5;
const upperBound = maxCharLength + maxCharLength * 0.5;
if (
chunkLength >= lowerBound &&
chunkLength <= upperBound &&
currentChunk
) {
// The piece is just right, so we save it and start a new one
// We remove any periods or spaces from the beginning of the piece and trim any whitespace
currentChunk = currentChunk.replace(/^[. ]+/, "").trim();
// We only push the piece if it is not empty
if (currentChunk) chunks.push(currentChunk);
// Reset the current piece
currentChunk = "";
} else if (chunkLength > upperBound) {
// The piece is too long, so save it and start a new one with the sentence
// Remove any periods or spaces from the beginning of the piece and trim any whitespace
currentChunk = currentChunk.replace(/^[. ]+/, "").trim();
// We only push the piece if it is not empty
if (currentChunk) chunks.push(currentChunk);
// Set the current piece to the sentence
currentChunk = trimmedSentence;
} else {
// The piece is too short, so add the sentence and continue
// Add a space before the sentence unless it is a period
currentChunk += `${trimmedSentence === "." ? "" : " "}${trimmedSentence}`;
}
}
// If there is any remaining piece, save it
if (currentChunk) {
chunks.push(currentChunk);
}
// Return the array of pieces
return chunks;
}
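A quick usage sketch of chunkText with illustrative values:

// Illustrative only: split a short text into sentence-aligned pieces of roughly 50 characters.
const sample =
  "Embeddings map text to vectors. Similar texts map to nearby vectors. " +
  "That makes semantic search possible. Long documents are chunked first.";
const pieces = chunkText({ text: sample, maxCharLength: 50 });
// Pieces are cut at sentence boundaries and kept within the 50% tolerance band
// around maxCharLength where possible.
console.log(pieces);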

@ -1,54 +0,0 @@
import { TextEmbedding } from "../types/file";
import { getEmbeddingsForText } from "./getEmbeddingsForText";
export type Embeddings = {
meanEmbedding: number[];
chunks: TextEmbedding[];
};
export async function createEmbeddings({
text,
maxCharLength,
}: {
text: string;
maxCharLength?: number;
}): Promise<Embeddings> {
try {
const textEmbeddings = await getEmbeddingsForText({
text,
maxCharLength,
});
// If there are 0 or 1 embeddings, the mean embedding is the same as the embedding
if (textEmbeddings.length <= 1) {
return {
meanEmbedding: textEmbeddings[0]?.embedding ?? [],
chunks: textEmbeddings,
};
}
// If there are multiple embeddings, calculate their average
const embeddingLength = textEmbeddings[0].embedding.length;
const meanEmbedding = [];
for (let i = 0; i < embeddingLength; i++) {
// Sum up the values at the same index of each embedding
let sum = 0;
for (const textEmbedding of textEmbeddings) {
sum += textEmbedding.embedding[i];
}
// Divide by the number of embeddings to get the mean
meanEmbedding.push(sum / textEmbeddings.length);
}
return {
meanEmbedding,
chunks: textEmbeddings,
};
} catch (error: any) {
console.log("Error: ", error);
return {
meanEmbedding: [],
chunks: [],
};
}
}
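To make the averaging step concrete, a tiny worked example with toy vectors (not real embedding values):

// Toy illustration of the element-wise mean computed above.
const chunkVectors = [
  [1, 2, 3],
  [3, 4, 5],
];
const mean = chunkVectors[0].map(
  (_, i) => chunkVectors.reduce((sum, vec) => sum + vec[i], 0) / chunkVectors.length
);
console.log(mean); // [2, 3, 4]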

@ -1,45 +0,0 @@
import fs from "fs";
import mammoth from "mammoth";
import pdfParse from "pdf-parse";
import { NodeHtmlMarkdown } from "node-html-markdown";
export default async function extractTextFromFile({
filepath,
filetype,
}: {
filepath: string;
filetype: string;
}): Promise<string> {
const buffer: Buffer = await new Promise((resolve, reject) => {
const fileStream = fs.createReadStream(filepath);
const chunks: any[] = [];
fileStream.on("data", (chunk) => {
chunks.push(chunk);
});
fileStream.on("error", (error) => {
reject(error);
});
fileStream.on("end", () => {
resolve(Buffer.concat(chunks));
});
});
// Handle different file types using different modules
switch (filetype) {
case "application/pdf":
const pdfData = await pdfParse(buffer);
return pdfData.text;
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": // i.e. docx file
const docxResult = await mammoth.extractRawText({ path: filepath });
return docxResult.value;
case "text/markdown":
case "text/csv":
case "text/html":
const html = buffer.toString();
return NodeHtmlMarkdown.translate(html);
case "text/plain":
return buffer.toString();
default:
throw new Error("Unsupported file type");
}
}

@ -1,42 +0,0 @@
import { TextEmbedding } from "../types/file";
import { chunkText } from "./chunkText";
import { embedding } from "./openai";
// There isn't a good JS tokenizer at the moment, so we are using this approximation of 4 characters per token instead. This might break for some languages.
const MAX_CHAR_LENGTH = 250 * 4;
// This function takes a text and returns an array of embeddings for each chunk of the text
// The text is split into chunks of a given maximum character length
// The embeddings are computed in batches of a given size
export async function getEmbeddingsForText({
text,
maxCharLength = MAX_CHAR_LENGTH,
batchSize = 20,
}: {
text: string;
maxCharLength?: number;
batchSize?: number;
}): Promise<TextEmbedding[]> {
const textChunks = chunkText({ text, maxCharLength });
const batches = [];
for (let i = 0; i < textChunks.length; i += batchSize) {
batches.push(textChunks.slice(i, i + batchSize));
}
try {
const batchPromises = batches.map((batch) => embedding({ input: batch }));
const embeddings = (await Promise.all(batchPromises)).flat();
const textEmbeddings = embeddings.map((embedding, index) => ({
embedding,
text: textChunks[index],
}));
return textEmbeddings;
} catch (error: any) {
console.log("Error: ", error);
return [];
}
}
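A hedged usage sketch; it assumes OPENAI_API_KEY is set (as required by services/openai) and uses arbitrary parameter values.

// Sketch only: embed a long string and inspect the per-chunk results.
async function embedDocument(longText: string) {
  const textEmbeddings = await getEmbeddingsForText({
    text: longText,
    maxCharLength: 1000, // ~250 tokens at the 4-characters-per-token approximation
    batchSize: 20, // e.g. 45 chunks would be sent as batches of 20, 20 and 5
  });
  console.log(textEmbeddings.length); // one { text, embedding } entry per chunk
  return textEmbeddings;
}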

@ -1,150 +0,0 @@
import { IncomingMessage } from "http";
import {
ChatCompletionRequestMessageRoleEnum,
Configuration,
CreateChatCompletionResponse,
CreateCompletionRequest,
OpenAIApi,
} from "openai";
// This file contains utility functions for interacting with the OpenAI API
if (!process.env.OPENAI_API_KEY) {
throw new Error("Missing OPENAI_API_KEY environment variable");
}
const configuration = new Configuration({
apiKey: process.env.OPENAI_API_KEY,
});
export const openai = new OpenAIApi(configuration);
type CompletionOptions = Partial<CreateCompletionRequest> & {
prompt: string;
fallback?: string;
};
type EmbeddingOptions = {
input: string | string[];
model?: string;
};
export async function completion({
prompt,
fallback,
max_tokens,
temperature = 0,
model = "gpt-3.5-turbo", // use gpt-4 for better results
}: CompletionOptions) {
try {
// Note: this is not the proper way to use the ChatGPT conversational format, but it works for now
const messages = [
{
role: ChatCompletionRequestMessageRoleEnum.System,
content: prompt ?? "",
},
];
const result = await openai.createChatCompletion({
model,
messages,
temperature,
max_tokens: max_tokens ?? 800,
});
if (!result.data.choices[0].message) {
throw new Error("No text returned from completions endpoint");
}
return result.data.choices[0].message.content;
} catch (error) {
if (fallback) return fallback;
else throw error;
}
}
export async function* completionStream({
prompt,
fallback,
max_tokens = 800,
temperature = 0,
model = "gpt-3.5-turbo", // use gpt-4 for better results
}: CompletionOptions) {
try {
// Note: this is not the proper way to use the ChatGPT conversational format, but it works for now
const messages = [
{
role: ChatCompletionRequestMessageRoleEnum.System,
content: prompt ?? "",
},
];
const result = await openai.createChatCompletion(
{
model,
messages,
temperature,
max_tokens: max_tokens ?? 800,
stream: true,
},
{
responseType: "stream",
}
);
const stream = result.data as any as IncomingMessage;
let buffer = "";
const textDecoder = new TextDecoder();
for await (const chunk of stream) {
buffer += textDecoder.decode(chunk, { stream: true });
const lines = buffer.split("\n");
// Check if the last line is complete
if (buffer.endsWith("\n")) {
buffer = "";
} else {
buffer = lines.pop() || "";
}
for (const line of lines) {
const message = line.trim().split("data: ")[1];
if (message === "[DONE]") {
break;
}
// Check if the message is not undefined and a valid JSON string
if (message) {
try {
const data = JSON.parse(message) as CreateChatCompletionResponse;
// @ts-ignore
if (data.choices[0].delta?.content) {
// @ts-ignore
yield data.choices[0].delta?.content;
}
} catch (error) {
console.error("Error parsing JSON message:", error);
}
}
}
}
} catch (error) {
if (fallback) yield fallback;
else throw error;
}
}
export async function embedding({
input,
model = "text-embedding-ada-002",
}: EmbeddingOptions): Promise<number[][]> {
const result = await openai.createEmbedding({
model,
input,
});
if (!result.data.data[0].embedding) {
throw new Error("No embedding returned from the completions endpoint");
}
// Otherwise, return the embeddings
return result.data.data.map((d) => d.embedding);
}
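For context, a minimal sketch of consuming the completionStream generator defined above; it assumes OPENAI_API_KEY is set and the prompt is an arbitrary example.

// Sketch only: stream tokens from completionStream and assemble the full answer.
async function streamAnswer() {
  let answer = "";
  for await (const token of completionStream({
    prompt: "Summarise the uploaded document in one paragraph.",
  })) {
    answer += token; // tokens arrive incrementally as the model generates them
    process.stdout.write(token);
  }
  return answer;
}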

@ -1,53 +0,0 @@
import { FileLite, FileChunk } from "../types/file";
import { embedding } from "./openai";
// This is the minimum cosine similarity score that a file must have with the search query to be considered relevant
// This is an arbitrary value, and you should vary or remove this depending on the diversity of your dataset
const COSINE_SIM_THRESHOLD = 0.72;
// This function takes a search query and a list of files, and returns the chunks of text that are most semantically similar to the query
export async function searchFileChunks({
searchQuery,
files,
maxResults,
}: {
searchQuery: string;
files: FileLite[];
maxResults: number;
}): Promise<FileChunk[]> {
// Get the search query embedding
const searchQueryEmbeddingResponse = await embedding({
input: searchQuery,
});
// Get the first element in the embedding array
const searchQueryEmbedding =
searchQueryEmbeddingResponse.length > 0
? searchQueryEmbeddingResponse[0]
: [];
// Rank the chunks by their cosine similarity to the search query (using dot product since the embeddings are normalized) and return this
const rankedChunks = files
// Map each file to an array of chunks with the file name and score
.flatMap((file) =>
file.chunks
? file.chunks.map((chunk) => {
// Calculate the dot product between the chunk embedding and the search query embedding
const dotProduct = chunk.embedding.reduce(
(sum, val, i) => sum + val * searchQueryEmbedding[i],
0
);
// Assign the dot product as the score for the chunk
return { ...chunk, filename: file.name, score: dotProduct };
})
: []
)
// Sort the chunks by their scores in descending order
.sort((a, b) => b.score - a.score)
// Filter the chunks by their score above the threshold
.filter((chunk) => chunk.score > COSINE_SIM_THRESHOLD)
// Take the first maxResults chunks
.slice(0, maxResults);
return rankedChunks;
}
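Since the embeddings are normalised (as noted in the comment above), the dot product is the cosine similarity; a toy illustration:

// Toy illustration: for unit-length vectors the dot product equals cosine similarity.
const a = [0.6, 0.8]; // |a| = 1
const b = [0.8, 0.6]; // |b| = 1
const dot = a.reduce((sum, val, i) => sum + val * b[i], 0);
console.log(dot); // 0.96, above the 0.72 threshold used above, so this chunk would be kept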

@ -1,19 +0,0 @@
// A function that takes a file name and a string and returns true if the file name is contained in the string
// after removing punctuation and whitespace from both
export const isFileNameInString = (fileName: string, str: string) => {
// Check if the input string is null or undefined
if (!str) {
return false;
}
// Convert both to lowercase and remove punctuation and whitespace
const normalizedFileName = fileName
.toLowerCase()
.replace(/[.,/#!$%^&*;:{}=\-_~()\s]/g, "");
const normalizedStr = str
.toLowerCase()
.replace(/[.,/#!$%^&*;:{}=\-_~()\s]/g, "");
// Return true if the normalized file name is included in the normalized string
return normalizedStr.includes(normalizedFileName);
};
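A quick illustration of the matching behaviour (hypothetical file names):

// Case, punctuation and whitespace are ignored on both sides before matching.
console.log(isFileNameInString("FY23_Report.pdf", "See the fy23 report (PDF) for details")); // true
console.log(isFileNameInString("budget.xlsx", "This answer cites the revenue forecast")); // false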

@ -1,5 +0,0 @@
@import "./preflight.css";
@tailwind base;
@tailwind components;
@tailwind utilities;

@ -1,368 +0,0 @@
/* Using a custom preflight to fix conflicts with Ant Design */
/* Original: https://unpkg.com/tailwindcss@3.2.4/src/css/preflight.css */
/*
1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4)
2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116)
*/
*,
::before,
::after {
box-sizing: border-box; /* 1 */
border-width: 0; /* 2 */
border-style: solid; /* 2 */
border-color: theme("borderColor.DEFAULT"); /* 2 */
}
::before,
::after {
--tw-content: "";
}
/*
1. Use a consistent sensible line-height in all browsers.
2. Prevent adjustments of font size after orientation changes in iOS.
3. Use a more readable tab size.
4. Use the user's configured `sans` font-family by default.
5. Use the user's configured `sans` font-feature-settings by default.
*/
html {
line-height: 1.5; /* 1 */
-webkit-text-size-adjust: 100%; /* 2 */
-moz-tab-size: 4; /* 3 */
tab-size: 4; /* 3 */
font-family: theme("fontFamily.sans"); /* 4 */
}
/*
1. Remove the margin in all browsers.
2. Inherit line-height from `html` so users can set them as a class directly on the `html` element.
*/
body {
margin: 0; /* 1 */
line-height: inherit; /* 2 */
}
/*
1. Add the correct height in Firefox.
2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655)
3. Ensure horizontal rules are visible by default.
*/
hr {
height: 0; /* 1 */
color: inherit; /* 2 */
border-top-width: 1px; /* 3 */
}
/*
Add the correct text decoration in Chrome, Edge, and Safari.
*/
abbr:where([title]) {
text-decoration: underline dotted;
}
/*
Remove the default font size and weight for headings.
*/
h1,
h2,
h3,
h4,
h5,
h6 {
font-size: inherit;
font-weight: inherit;
}
/*
Reset links to optimize for opt-in styling instead of opt-out.
*/
a {
color: inherit;
text-decoration: inherit;
}
/*
Add the correct font weight in Edge and Safari.
*/
b,
strong {
font-weight: bolder;
}
/*
1. Use the user's configured `mono` font family by default.
2. Correct the odd `em` font sizing in all browsers.
*/
code,
kbd,
samp,
pre {
font-family: theme("fontFamily.mono"); /* 1 */
font-size: 1em; /* 2 */
}
/*
Add the correct font size in all browsers.
*/
small {
font-size: 80%;
}
/*
Prevent `sub` and `sup` elements from affecting the line height in all browsers.
*/
sub,
sup {
font-size: 75%;
line-height: 0;
position: relative;
vertical-align: baseline;
}
sub {
bottom: -0.25em;
}
sup {
top: -0.5em;
}
/*
1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297)
2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016)
3. Remove gaps between table borders by default.
*/
table {
text-indent: 0; /* 1 */
border-color: inherit; /* 2 */
border-collapse: collapse; /* 3 */
}
/*
1. Change the font styles in all browsers.
2. Remove the margin in Firefox and Safari.
3. Remove default padding in all browsers.
*/
button,
input,
optgroup,
select,
textarea {
font-family: inherit; /* 1 */
font-size: 100%; /* 1 */
font-weight: inherit; /* 1 */
line-height: inherit; /* 1 */
color: inherit; /* 1 */
margin: 0; /* 2 */
padding: 0; /* 3 */
}
/*
Remove the inheritance of text transform in Edge and Firefox.
*/
button,
select {
text-transform: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Remove default button styles.
*/
button,
[type="button"],
[type="reset"],
[type="submit"] {
-webkit-appearance: button; /* 1 */
background-image: none; /* 2 */
}
/*
Use the modern Firefox focus style for all focusable elements.
*/
:-moz-focusring {
outline: auto;
}
/*
Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737)
*/
:-moz-ui-invalid {
box-shadow: none;
}
/*
Add the correct vertical alignment in Chrome and Firefox.
*/
progress {
vertical-align: baseline;
}
/*
Correct the cursor style of increment and decrement buttons in Safari.
*/
::-webkit-inner-spin-button,
::-webkit-outer-spin-button {
height: auto;
}
/*
1. Correct the odd appearance in Chrome and Safari.
2. Correct the outline style in Safari.
*/
[type="search"] {
-webkit-appearance: textfield; /* 1 */
outline-offset: -2px; /* 2 */
}
/*
Remove the inner padding in Chrome and Safari on macOS.
*/
::-webkit-search-decoration {
-webkit-appearance: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Change font properties to `inherit` in Safari.
*/
::-webkit-file-upload-button {
-webkit-appearance: button; /* 1 */
font: inherit; /* 2 */
}
/*
Add the correct display in Chrome and Safari.
*/
summary {
display: list-item;
}
/*
Removes the default spacing and border for appropriate elements.
*/
blockquote,
dl,
dd,
h1,
h2,
h3,
h4,
h5,
h6,
hr,
figure,
p,
pre {
margin: 0;
}
fieldset {
margin: 0;
padding: 0;
}
legend {
padding: 0;
}
ol,
ul,
menu {
list-style: none;
margin: 0;
padding: 0;
}
/*
Prevent resizing textareas horizontally by default.
*/
textarea {
resize: vertical;
}
/*
1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300)
2. Set the default placeholder color to the user's configured gray 400 color.
*/
input::placeholder,
textarea::placeholder {
opacity: 1; /* 1 */
color: theme("colors.gray.400"); /* 2 */
}
/*
Set the default cursor for buttons.
*/
button,
[role="button"] {
cursor: pointer;
}
/*
Make sure disabled buttons don't get the pointer cursor.
*/
:disabled {
cursor: default;
}
/*
1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14)
2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210)
This can trigger a poorly considered lint error in some tools but is included by design.
*/
img,
svg,
video,
canvas,
audio,
iframe,
embed,
object {
display: block; /* 1 */
vertical-align: middle; /* 2 */
}
/*
Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. (https://github.com/mozdevs/cssremedy/issues/14)
*/
img,
video {
max-width: 100%;
height: auto;
}
/* Make elements with the HTML hidden attribute stay hidden by default */
[hidden] {
display: none;
}

@ -1,21 +0,0 @@
export interface FileLite {
expanded?: boolean;
name: string;
url?: string;
type?: string;
score?: number;
size?: number;
embedding?: number[]; // The file embedding -- or mean embedding if there are multiple embeddings for the file
chunks?: TextEmbedding[]; // The chunks of text and their embeddings
extractedText?: string; // The extracted text from the file
}
export interface FileChunk extends TextEmbedding {
filename: string;
score?: number;
}
export interface TextEmbedding {
text: string;
embedding: number[];
}

@ -1,28 +0,0 @@
const { fontFamily } = require("tailwindcss/defaultTheme");
/** @type {import('tailwindcss').Config} */
module.exports = {
content: [
"./app/**/*.{js,ts,jsx,tsx}",
"./src/**/*.{js,ts,jsx,tsx}",
"./pages/**/*.{js,ts,jsx,tsx}",
"./components/**/*.{js,ts,jsx,tsx}",
],
corePlugins: {
preflight: false,
},
theme: {
extend: {
// keyframes must be nested under theme.extend for Tailwind to generate the corresponding utilities
keyframes: {
blink: {
"0%, 100%": { opacity: 1 },
"50%": { opacity: 0 },
},
},
},
},
plugins: [
require("@tailwindcss/line-clamp"),
require("@tailwindcss/typography"),
],
};

@ -1,24 +0,0 @@
{
"compilerOptions": {
"target": "es5",
"lib": ["dom", "dom.iterable", "esnext"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
"forceConsistentCasingInFileNames": true,
"noEmit": true,
"esModuleInterop": true,
"module": "esnext",
"moduleResolution": "node",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"incremental": true,
"baseUrl": ".",
"paths": {
"@/*": ["./src/*"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"],
"exclude": ["node_modules"]
}

@ -1,79 +0,0 @@
aiohttp==3.8.5
aiosignal==1.3.1
appnope==0.1.3
asttokens==2.2.1
async-timeout==4.0.2
attrs==22.2.0
backcall==0.2.0
beautifulsoup4==4.11.1
blobfile==2.0.1
bs4==0.0.1
certifi==2023.7.22
charset-normalizer==2.1.1
comm==0.1.2
contourpy==1.0.7
cycler==0.11.0
debugpy==1.6.5
decorator==5.1.1
docopt==0.6.2
entrypoints==0.4
executing==1.2.0
filelock==3.9.0
fonttools==4.38.0
frozenlist==1.3.3
huggingface-hub==0.11.1
idna==3.4
ipykernel==6.20.1
ipython==8.10.0
jedi==0.18.2
joblib==1.2.0
jupyter_client==7.4.8
jupyter_core==5.1.3
kiwisolver==1.4.4
lxml==4.9.2
matplotlib==3.6.3
matplotlib-inline==0.1.6
multidict==6.0.4
nest-asyncio==1.5.6
numpy==1.24.1
openai==0.26.1
packaging==23.0
pandas==1.5.2
parso==0.8.3
pexpect==4.8.0
pickleshare==0.7.5
Pillow==10.0.1
pipreqs==0.4.12
platformdirs==2.6.2
plotly==5.12.0
prompt-toolkit==3.0.36
psutil==5.9.4
ptyprocess==0.7.0
pure-eval==0.2.2
pycryptodomex==3.17
Pygments==2.15.0
pyparsing==3.0.9
python-dateutil==2.8.2
pytz==2022.7.1
PyYAML==6.0
pyzmq==24.0.1
regex==2022.10.31
requests==2.31.0
scikit-learn==1.2.0
scipy==1.10.0
six==1.16.0
soupsieve==2.3.2.post1
stack-data==0.6.2
tenacity==8.1.0
threadpoolctl==3.1.0
tiktoken==0.1.2
tokenizers==0.13.2
tornado==6.3.3
tqdm==4.64.1
traitlets==5.8.1
transformers==4.30.0
typing_extensions==4.4.0
urllib3==1.26.17
wcwidth==0.2.5
yarg==0.1.9
yarl==1.8.2

File diff suppressed because one or more lines are too long

Some files were not shown because too many files have changed in this diff.