File Q and A app

pull/1077/head
isafulf 2 years ago
parent 3973c85840
commit b844afc6d6

@ -0,0 +1,18 @@
# File Q&A
File Q&A is a [Next.js](https://nextjs.org/) app that lets you find answers in your files using OpenAI APIs. You can upload files and ask questions related to their content, and the app will use embeddings and GPT to generate answers from the most relevant files.
This repo contains two versions of the app:
- `/nextjs`: A standalone Next.js app that stores embeddings locally in the browser. You will need an OpenAI API key to use this app. Read more in its [README](./nextjs/README.md).
- `/nextjs-with-flask-server`: A Next.js app that uses a Flask server as a proxy to access the OpenAI APIs, and Pinecone as a vector database to store embeddings. You will need an OpenAI API key and a Pinecone API key to use this app. Read more in its [README](./nextjs-with-flask-server/README.md).
To run either version of the app, please follow the instructions in the respective README.md files in the subdirectories.
## How it works
When a file is uploaded, text is extracted from the file. This text is then split into shorter text chunks, and an embedding is created for each text chunk. When the user asks a question, an embedding is created for the question, and a similarity search is performed to find the file chunk embeddings that are most similar to the question (i.e. have the highest cosine similarities with the question embedding). An API call is then made to the completions endpoint, with the question and the most relevant file chunks included in the prompt. The generative model then gives the answer to the question found in the file chunks, if the answer can be found in the extracts.
## Limitations
The app may sometimes generate answers that are not in the files, or hallucinate about the existence of files that are not uploaded.

@ -0,0 +1,47 @@
# File Q&A with Next.js and Flask
File Q&A is a web app that lets you find answers in your files. You can upload files and ask questions related to their content, and the app will use embeddings and GPT to generate answers from the most relevant files.
## Requirements
To run the app, you need:
- An OpenAI API key. You can create a new API key [here](https://beta.openai.com/account/api-keys).
- A Pinecone API key and index name. You can create a new account and index [here](https://www.pinecone.io/).
- Python 3.7 or higher and pipenv for the Flask server.
- Node.js and npm for the Next.js client.
## Set-Up and Development
### Server
Fill out the config.yaml file with your Pinecone API key, index name and environment.
Run the Flask server:
```
cd server
bash script/start "<your OPENAI_API_KEY>"
```
### Client
Navigate to the client directory and install Node dependencies:
```
cd client
npm install
```
Run the Next.js client:
```
cd client
npm run dev
```
Open [http://localhost:3000](http://localhost:3000) with your browser to see the app.
## Limitations
The app may sometimes generate answers that are not in the files, or hallucinate about the existence of files that are not uploaded.

@ -0,0 +1,3 @@
{
"extends": "next/core-web-vitals"
}

@ -0,0 +1,36 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.js
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# local env files
.env*.local
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts

@ -0,0 +1,6 @@
/** @type {import('next').NextConfig} */
const nextConfig = {
  // Enable React Strict Mode: double-invokes render in development to
  // surface unsafe lifecycles and side effects.
  reactStrictMode: true,
}
module.exports = nextConfig

File diff suppressed because it is too large Load Diff

@ -0,0 +1,44 @@
{
"name": "file-q-and-a",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"@headlessui/react": "^1.7.7",
"@heroicons/react": "^2.0.13",
"@next/font": "13.1.2",
"@tailwindcss/line-clamp": "^0.4.2",
"@tailwindcss/typography": "^0.5.9",
"@types/formidable": "^2.0.5",
"@types/lodash": "^4.14.191",
"@types/node": "18.11.18",
"@types/pdf-parse": "^1.1.1",
"@types/react": "18.0.27",
"@types/react-dom": "18.0.10",
"axios": "^1.2.3",
"clsx": "^1.2.1",
"eslint": "8.32.0",
"eslint-config-next": "13.1.2",
"formidable": "^2.1.1",
"lodash": "^4.17.21",
"mammoth": "^1.5.1",
"next": "13.1.2",
"node-html-markdown": "^1.3.0",
"openai": "^3.1.0",
"pdf-parse": "^1.1.1",
"react": "18.2.0",
"react-dom": "18.2.0",
"react-markdown": "^8.0.5",
"typescript": "4.9.4"
},
"devDependencies": {
"autoprefixer": "^10.4.13",
"postcss": "^8.4.21",
"tailwindcss": "^3.2.4"
}
}

@ -0,0 +1,6 @@
// PostCSS pipeline: run Tailwind first, then add vendor prefixes with
// Autoprefixer. Empty objects mean "use each plugin's default options".
module.exports = {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
};

Binary file not shown.

After

Width:  |  Height:  |  Size: 262 KiB

@ -0,0 +1 @@
<svg id="openai-horizontal" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 120 29.53"><path d="M40.7,6.98s-.05,0-.07,0c-.02,0-.05,0-.07,0-4.67,0-7.58,2.91-7.58,7.6v2.53c0,4.69,2.9,7.6,7.58,7.6,.02,0,.05,0,.07,0,.02,0,.05,0,.07,0,4.67,0,7.58-2.91,7.58-7.6v-2.53c0-4.69-2.91-7.6-7.58-7.6Zm4.31,10.31c0,3.08-1.6,4.86-4.38,4.89-2.78-.03-4.38-1.81-4.38-4.89v-2.88c0-3.08,1.6-4.86,4.38-4.89,2.78,.03,4.38,1.81,4.38,4.89v2.88Zm40.57-5.79s-.06,0-.09,0c-.02,0-.03,0-.05,0-1.77,0-3.03,.6-3.65,1.75l-.19,.35v-1.8h-3.02v12.56h3.17v-7.48c0-1.76,.95-2.77,2.59-2.8,1.57,.03,2.47,1.02,2.47,2.73v7.55h3.17v-8.09c0-2.99-1.64-4.77-4.39-4.77Zm34.42-1.77v-2.4h-10.46v2.4h3.67v12.22h-3.67v2.4h10.46v-2.4h-3.67V9.73h3.67Zm-18.75-2.4h0s-3.28,0-3.28,0l-6.1,17.04h3.43l1.17-3.65h6.66v.04s1.17,3.62,1.17,3.62h3.43l-6.11-17.04h-.36Zm-4.03,10.98l2.57-8.05,2.55,8.05h-5.12Zm-39.45-6.81s-.05,0-.07,0c-.03,0-.05,0-.07,0-1.59,0-2.96,.66-3.68,1.76l-.18,.28v-1.74h-3.02V28.69h3.17v-5.9l.18,.27c.68,1.01,2.01,1.61,3.56,1.61,.03,0,.05,0,.08,0,.02,0,.04,0,.07,0,2.61,0,5.24-1.7,5.24-5.51v-2.14c0-2.74-1.62-5.51-5.26-5.51Zm2.1,7.5c0,2-1.15,3.24-3.01,3.28-1.73-.03-2.94-1.35-2.94-3.23v-1.89c0-1.9,1.22-3.24,2.97-3.28,1.84,.03,2.98,1.28,2.98,3.28v1.84Zm11.05-7.5h0c-.06,0-.12,.01-.18,.01-.06,0-.12-.01-.18-.01h0c-3.57,0-5.78,2.23-5.78,5.81v1.76c0,3.45,2.24,5.59,5.83,5.59,.08,0,.15,0,.22-.01,.05,0,.09,.01,.14,.01,2.41,0,4.09-.88,5.16-2.7l-2.13-1.23c-.71,1.05-1.66,1.84-3.02,1.84-1.82,0-2.91-1.12-2.91-3.01v-.5h8.44v-2.08c0-3.34-2.19-5.49-5.59-5.49Zm-2.86,5.54v-.3c0-2,.95-3.12,2.68-3.2,1.66,.08,2.66,1.18,2.66,2.99v.5s-5.34,0-5.34,0Z"></path><path 
d="M27.21,12.08c.67-2.01,.44-4.21-.63-6.04-1.61-2.8-4.85-4.24-8.01-3.57C17.16,.89,15.14-.01,13.02,0c-3.23,0-6.1,2.08-7.1,5.15-2.08,.43-3.87,1.73-4.92,3.57-1.62,2.8-1.25,6.32,.92,8.72-.67,2.01-.44,4.21,.63,6.03,1.61,2.81,4.85,4.25,8.02,3.58,1.4,1.58,3.42,2.49,5.54,2.48,3.23,0,6.1-2.08,7.1-5.15,2.08-.43,3.87-1.73,4.91-3.57,1.63-2.8,1.26-6.32-.91-8.72Zm-2.3-5.07c.64,1.12,.88,2.43,.66,3.7-.04-.03-.12-.07-.17-.1l-5.88-3.4c-.3-.17-.67-.17-.97,0l-6.89,3.98v-2.92l5.69-3.29c2.65-1.53,6.03-.62,7.56,2.03Zm-13.25,6.07l2.9-1.68,2.9,1.68v3.35l-2.9,1.68-2.9-1.68v-3.35ZM13.01,1.93c1.3,0,2.55,.45,3.55,1.28-.04,.02-.12,.07-.18,.1l-5.88,3.39c-.3,.17-.48,.49-.48,.84v7.96l-2.53-1.46V7.46c0-3.06,2.47-5.53,5.53-5.54ZM2.68,9.69h0c.65-1.12,1.66-1.98,2.88-2.43v6.99c0,.35,.18,.66,.48,.84l6.88,3.97-2.54,1.47-5.68-3.28c-2.64-1.53-3.55-4.91-2.02-7.56Zm1.55,12.83h0c-.65-1.11-.88-2.43-.66-3.7,.04,.03,.12,.07,.17,.1l5.88,3.4c.3,.17,.67,.17,.97,0l6.88-3.98v2.92l-5.69,3.28c-2.65,1.52-6.03,.62-7.56-2.02Zm11.89,5.08c-1.29,0-2.55-.45-3.54-1.28,.04-.02,.13-.07,.18-.1l5.88-3.39c.3-.17,.49-.49,.48-.84v-7.95l2.53,1.46v6.57c0,3.06-2.48,5.54-5.53,5.54Zm10.34-7.76c-.65,1.12-1.67,1.98-2.88,2.42v-6.99c0-.35-.18-.67-.48-.84h0l-6.89-3.98,2.53-1.46,5.69,3.28c2.65,1.53,3.55,4.91,2.02,7.56Z"></path></svg>

After

Width:  |  Height:  |  Size: 2.8 KiB

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" id="openai-symbol" viewBox="0 0 32 32"><path d="M29.71,13.09A8.09,8.09,0,0,0,20.34,2.68a8.08,8.08,0,0,0-13.7,2.9A8.08,8.08,0,0,0,2.3,18.9,8,8,0,0,0,3,25.45a8.08,8.08,0,0,0,8.69,3.87,8,8,0,0,0,6,2.68,8.09,8.09,0,0,0,7.7-5.61,8,8,0,0,0,5.33-3.86A8.09,8.09,0,0,0,29.71,13.09Zm-12,16.82a6,6,0,0,1-3.84-1.39l.19-.11,6.37-3.68a1,1,0,0,0,.53-.91v-9l2.69,1.56a.08.08,0,0,1,.05.07v7.44A6,6,0,0,1,17.68,29.91ZM4.8,24.41a6,6,0,0,1-.71-4l.19.11,6.37,3.68a1,1,0,0,0,1,0l7.79-4.49V22.8a.09.09,0,0,1,0,.08L13,26.6A6,6,0,0,1,4.8,24.41ZM3.12,10.53A6,6,0,0,1,6.28,7.9v7.57a1,1,0,0,0,.51.9l7.75,4.47L11.85,22.4a.14.14,0,0,1-.09,0L5.32,18.68a6,6,0,0,1-2.2-8.18Zm22.13,5.14-7.78-4.52L20.16,9.6a.08.08,0,0,1,.09,0l6.44,3.72a6,6,0,0,1-.9,10.81V16.56A1.06,1.06,0,0,0,25.25,15.67Zm2.68-4-.19-.12-6.36-3.7a1,1,0,0,0-1.05,0l-7.78,4.49V9.2a.09.09,0,0,1,0-.09L19,5.4a6,6,0,0,1,8.91,6.21ZM11.08,17.15,8.38,15.6a.14.14,0,0,1-.05-.08V8.1a6,6,0,0,1,9.84-4.61L18,3.6,11.61,7.28a1,1,0,0,0-.53.91ZM12.54,14,16,12l3.47,2v4L16,20l-3.47-2Z"/></svg>

After

Width:  |  Height:  |  Size: 1.0 KiB

@ -0,0 +1,77 @@
import { useState, useCallback, memo } from "react";
import { Transition } from "@headlessui/react";
import {
MagnifyingGlassMinusIcon,
MagnifyingGlassPlusIcon,
ArrowTopRightOnSquareIcon,
} from "@heroicons/react/24/outline";
import { FileLite } from "../types/file";
type FileProps = {
file: FileLite;
showScore?: boolean;
};
// Renders one file row: its name, an optional relevance score, an
// expand/collapse magnifier toggle, an external-link icon, and — when
// expanded — an inline iframe preview of the file's object URL.
function File(props: FileProps) {
  // Whether the inline preview is currently shown.
  const [expanded, setExpanded] = useState(false);

  const handleExpand = useCallback(() => {
    setExpanded((prev) => !prev);
  }, []);

  return (
    <div
      className="border-gray-100 border rounded-md shadow p-2 cursor-pointer"
      onClick={handleExpand}
    >
      <div className="flex flex-row justify-between">
        <div className="flex hover:text-gray-600">{props.file.name}</div>

        <div className="flex flex-row space-x-2">
          {/* Score badge is only rendered when both the flag and a score are set */}
          {props.showScore && props.file.score && (
            <div className="flex text-blue-600 mr-4">
              {props.file.score.toFixed(2)}
            </div>
          )}

          <div className="ml-auto w-max flex items-center justify-center">
            {expanded ? (
              <MagnifyingGlassMinusIcon className="text-gray-500 h-5" />
            ) : (
              <MagnifyingGlassPlusIcon className="text-gray-500 h-5" />
            )}
          </div>

          <a
            href={props.file.url}
            target="_blank"
            rel="noopener noreferrer"
            onClick={(e) => e.stopPropagation()} // prevent the click event from bubbling up to the list item
          >
            <ArrowTopRightOnSquareIcon className="text-gray-500 h-5" />
          </a>
        </div>
      </div>

      {/* Animated reveal of the iframe preview */}
      <Transition
        show={expanded}
        enter="transition duration-75 ease-out"
        enterFrom="transform translate-y-4 opacity-0"
        enterTo="transform translate-y-0 opacity-100"
        leave="transition duration-100 ease-out"
        leaveFrom="transform translate-y-0 opacity-100"
        leaveTo="transform translate-y-4 opacity-0"
      >
        <div className="items-center mt-2 justify-center">
          <iframe
            src={props.file.url}
            className="h-full w-full"
            title={props.file.name}
          ></iframe>
        </div>
      </Transition>
    </div>
  );
}

export default memo(File);

@ -0,0 +1,147 @@
import React, { memo, useCallback, useRef, useState } from "react";
import { Transition } from "@headlessui/react";
import axios from "axios";
import ReactMarkdown from "react-markdown";
import FileViewerList from "./FileViewerList";
import LoadingText from "./LoadingText";
import { isFileNameInString } from "../services/utils";
import { FileChunk, FileLite } from "../types/file";
import { SERVER_ADDRESS } from "../types/constants";
type FileQandAAreaProps = {
files: FileLite[];
};
// Question-and-answer area: a search bar plus the rendered answer and the
// list of source files whose names appear in that answer.
function FileQandAArea(props: FileQandAAreaProps) {
  const searchBarRef = useRef(null);
  const [answerError, setAnswerError] = useState("");
  const [searchResultsLoading, setSearchResultsLoading] =
    useState<boolean>(false);
  const [answer, setAnswer] = useState("");

  // Sends the current question to the server and stores the returned answer.
  const handleSearch = useCallback(async () => {
    // Ignore re-submits while a request is already in flight.
    if (searchResultsLoading) {
      return;
    }

    const question = (searchBarRef?.current as any)?.value ?? "";
    setAnswer("");

    if (!question) {
      setAnswerError("Please ask a question.");
      return;
    }
    if (props.files.length === 0) {
      setAnswerError("Please upload files before asking a question.");
      return;
    }

    setSearchResultsLoading(true);
    setAnswerError("");

    try {
      const answerResponse = await axios.post(
        `${SERVER_ADDRESS}/answer_question`,
        {
          question,
        }
      );

      if (answerResponse.status === 200) {
        setAnswer(answerResponse.data.answer);
      } else {
        setAnswerError("Sorry, something went wrong!");
      }
    } catch (err: any) {
      setAnswerError("Sorry, something went wrong!");
    } finally {
      // Always clear the loading flag, even if an unexpected error is thrown.
      setSearchResultsLoading(false);
    }
  }, [props.files, searchResultsLoading]);

  // Submit the question when the user presses Enter in the search bar.
  const handleEnterInSearchBar = useCallback(
    async (event: React.SyntheticEvent) => {
      if ((event as any).key === "Enter") {
        await handleSearch();
      }
    },
    [handleSearch]
  );

  return (
    <div className="space-y-4 text-gray-800">
      <div className="mt-2">
        Ask a question based on the content of your files:
      </div>
      <div className="space-y-2">
        <input
          className="border rounded border-gray-200 w-full py-1 px-2"
          placeholder="e.g. What were the key takeaways from the Q1 planning meeting?"
          name="search"
          ref={searchBarRef}
          onKeyDown={handleEnterInSearchBar}
        />
        <div
          className="rounded-md bg-gray-50 py-1 px-4 w-max text-gray-500 hover:bg-gray-100 border border-gray-100 shadow cursor-pointer"
          onClick={handleSearch}
        >
          {searchResultsLoading ? (
            <LoadingText text="Answering question..." />
          ) : (
            "Ask question"
          )}
        </div>
      </div>
      <div className="">
        {answerError && <div className="text-red-500">{answerError}</div>}
        <Transition
          show={answer !== ""}
          enter="transition duration-600 ease-out"
          enterFrom="transform opacity-0"
          enterTo="transform opacity-100"
          leave="transition duration-125 ease-out"
          leaveFrom="transform opacity-100"
          leaveTo="transform opacity-0"
          className="mb-8"
        >
          {/* answer from files */}
          {answer && (
            <div className="">
              <ReactMarkdown className="prose" linkTarget="_blank">
                {answer}
              </ReactMarkdown>
            </div>
          )}

          {/* Show a "Sources" list of files whose names are mentioned in the answer */}
          <Transition
            show={
              props.files.filter((file) =>
                isFileNameInString(file.name, answer)
              ).length > 0
            }
            enter="transition duration-600 ease-out"
            enterFrom="transform opacity-0"
            enterTo="transform opacity-100"
            leave="transition duration-125 ease-out"
            leaveFrom="transform opacity-100"
            leaveTo="transform opacity-0"
            className="mb-8"
          >
            <FileViewerList
              files={props.files.filter((file) =>
                isFileNameInString(file.name, answer)
              )}
              title="Sources"
              listExpanded={true}
            />
          </Transition>
        </Transition>
      </div>
    </div>
  );
}

export default memo(FileQandAArea);

@ -0,0 +1,195 @@
import React, {
Dispatch,
SetStateAction,
useCallback,
useState,
memo,
useRef,
} from "react";
import axios from "axios";
import { ArrowUpTrayIcon } from "@heroicons/react/24/outline";
import { compact } from "lodash";
import LoadingText from "./LoadingText";
import { FileLite } from "../types/file";
import FileViewerList from "./FileViewerList";
import { SERVER_ADDRESS } from "../types/constants";
type FileUploadAreaProps = {
handleSetFiles: Dispatch<SetStateAction<FileLite[]>>;
maxNumFiles: number;
maxFileSizeMB: number;
};
// Drag-and-drop / click-to-upload area. Validates type and size client-side,
// posts each accepted file to the server for processing, and reports the
// accepted files both locally and to the parent via handleSetFiles.
function FileUploadArea(props: FileUploadAreaProps) {
  const handleSetFiles = props.handleSetFiles;

  const [files, setFiles] = useState<FileLite[]>([]);
  const [loading, setLoading] = useState(false);
  const [error, setError] = useState("");
  const [dragOver, setDragOver] = useState(false);
  const dropzoneRef = useRef<HTMLLabelElement>(null);

  // Validates and uploads a batch of files selected via the picker or a drop.
  const handleFileChange = useCallback(
    async (selectedFiles: FileList | null) => {
      if (selectedFiles && selectedFiles.length > 0) {
        setError("");

        // Enforce the overall file-count cap before uploading anything.
        if (files.length + selectedFiles.length > props.maxNumFiles) {
          setError(`You can only upload up to ${props.maxNumFiles} files.`);
          if (dropzoneRef.current) {
            (dropzoneRef.current as any).value = "";
          }
          return;
        }

        setLoading(true);

        // Process all selected files in parallel; each entry resolves to a
        // FileLite on success or null when skipped/failed.
        const uploadedFiles = await Promise.all(
          Array.from(selectedFiles).map(async (file) => {
            // Check the file type
            if (
              file.type.match(
                /(text\/plain|application\/(pdf|msword|vnd\.openxmlformats-officedocument\.wordprocessingml\.document))/
              ) && // AND file isn't too big
              file.size < props.maxFileSizeMB * 1024 * 1024
            ) {
              // Check if the file name already exists in the files state
              if (files.find((f) => f.name === file.name)) {
                return null; // skip this file
              }

              const formData = new FormData();
              formData.append("file", file);

              try {
                const processFileResponse = await axios.post(
                  `${SERVER_ADDRESS}/process_file`,
                  formData,
                  {
                    headers: {
                      "Content-Type": "multipart/form-data",
                    },
                  }
                );

                if (
                  processFileResponse.status === 200 &&
                  processFileResponse.data.success
                ) {
                  const fileObject: FileLite = {
                    name: file.name,
                    url: URL.createObjectURL(file),
                    expanded: false,
                  };
                  console.log(fileObject);
                  return fileObject;
                } else {
                  console.log("Error processing file");
                  return null;
                }
              } catch (err: any) {
                console.log(`error processing file: ${err}`);
                return null;
              }
            } else {
              // Fixed typo: "PD" -> "PDF" in the user-facing message.
              alert(
                `Invalid file type or size. Only TXT, PDF or DOCX are allowed, up to ${props.maxFileSizeMB}MB.`
              );
              return null; // Skip this file
            }
          })
        );

        // Filter out any null values from the uploadedFiles array
        const validFiles = compact(uploadedFiles);

        // Set the files state with the valid files and the existing files
        setFiles((prevFiles) => [...prevFiles, ...validFiles]);
        handleSetFiles((prevFiles) => [...prevFiles, ...validFiles]);

        setLoading(false);
      }
    },
    [files, handleSetFiles, props.maxFileSizeMB, props.maxNumFiles]
  );

  // Drag-and-drop visual state handlers.
  const handleDragEnter = useCallback((event: React.DragEvent) => {
    event.preventDefault();
    setDragOver(true);
  }, []);

  const handleDragOver = useCallback((event: React.DragEvent) => {
    event.preventDefault();
  }, []);

  const handleDragLeave = useCallback((event: React.DragEvent) => {
    event.preventDefault();
    setDragOver(false);
  }, []);

  const handleDrop = useCallback(
    (event: React.DragEvent) => {
      event.preventDefault();
      setDragOver(false);
      const droppedFiles = event.dataTransfer.files;
      handleFileChange(droppedFiles);
    },
    [handleFileChange]
  );

  return (
    <div className="flex items-center justify-center w-full flex-col">
      <label
        htmlFor="dropzone-file"
        className={`flex flex-col shadow items-center justify-center w-full h-36 border-2 border-gray-300 border-dashed rounded-lg cursor-pointer bg-gray-50 hover:bg-gray-100 relative ${
          dragOver ? "border-blue-500 bg-blue-50" : ""
        }`}
        ref={dropzoneRef}
        onDragEnter={handleDragEnter}
        onDragOver={handleDragOver}
        onDragLeave={handleDragLeave}
        onDrop={handleDrop}
      >
        <div className="flex flex-col items-center justify-center pt-5 pb-6">
          {loading ? (
            <LoadingText text="Uploading..." />
          ) : (
            <div className="text-gray-500 flex flex-col items-center text-center">
              <ArrowUpTrayIcon className="w-7 h-7 mb-4" />
              <p className="mb-2 text-sm">
                <span className="font-semibold">Click to upload</span> or drag
                and drop
              </p>
              <p className="text-xs">
                PDF, DOCX or TXT (max {props.maxFileSizeMB}MB per file)
              </p>
              <p className="text-xs mt-1">
                You can upload up to {props.maxNumFiles - files.length} more{" "}
                {props.maxNumFiles - files.length === 1 ? "file" : "files"}
              </p>
              <input
                id="dropzone-file"
                type="file"
                className="hidden"
                multiple
                onChange={(event) => handleFileChange(event.target.files)}
              />
            </div>
          )}
        </div>
      </label>
      {error && (
        <div className="flex items-center justify-center w-full mt-4">
          <p className="text-sm text-red-500">{error}</p>
        </div>
      )}
      <FileViewerList files={files} title="Uploaded Files" />
    </div>
  );
}

export default memo(FileUploadArea);

@ -0,0 +1,73 @@
import React, { memo, useCallback, useState } from "react";
import { ChevronUpIcon } from "@heroicons/react/24/outline";
import clsx from "clsx";
import { Transition } from "@headlessui/react";
import File from "./File";
import { FileLite } from "../types/file";
type FileViewerListProps = {
files: FileLite[];
title: string;
listExpanded?: boolean;
showScores?: boolean;
};
// Collapsible list of files with a title, a count badge, and a chevron
// toggle. Renders nothing when the file list is empty.
function FileViewerList(props: FileViewerListProps) {
  // Whether the list body is visible; seeded from the optional prop.
  const [listExpanded, setListExpanded] = useState(props.listExpanded ?? false);

  const handleListExpand = useCallback(() => {
    setListExpanded((prev) => !prev);
  }, []);

  return (
    <div className="flex items-left justify-center w-full">
      {props.files.length > 0 && (
        <div className="flex flex-col items-left justify-center w-full mt-4">
          <div className="flex flex-row">
            <div
              className="rounded-md flex shadow p-2 mb-2 w-full bg-gray-50 items-center cursor-pointer "
              onClick={handleListExpand}
            >
              {props.title}
              {/* Count badge showing how many files are in the list */}
              <div className="bg-gray-300 ml-2 px-2 rounded-full w-max text-center text-sm ">
                {props.files.length}
              </div>
            </div>
            <div className="ml-auto w-max flex items-center justify-center">
              {/* Chevron rotates 180° when the list is collapsed */}
              <ChevronUpIcon
                className={clsx(
                  "w-6 h-6 ml-2 stroke-slate-400 transition-transform cursor-pointer",
                  !listExpanded && "-rotate-180"
                )}
                onClick={handleListExpand}
              />
            </div>
          </div>
          <Transition
            show={listExpanded}
            enter="transition duration-125 ease-out"
            enterFrom="transform translate-y-4 opacity-0"
            enterTo="transform translate-y-0 opacity-100"
            leave="transition duration-125 ease-out"
            leaveFrom="transform translate-y-0 opacity-100"
            leaveTo="transform translate-y-4 opacity-0"
          >
            <div className="text-sm text-gray-500 space-y-2">
              {props.files.map((file) => (
                <File
                  key={file.name}
                  file={file}
                  showScore={props.showScores}
                />
              ))}
            </div>
          </Transition>
        </div>
      )}
    </div>
  );
}

export default memo(FileViewerList);

@ -0,0 +1,33 @@
import clsx from "clsx";
type Props = {
className?: string;
size?: number;
};
// Circular loading spinner rendered as an inline SVG.
export default function LoadingSpinner(props: Props) {
  // Default diameter is Tailwind size unit 5 (1.25rem).
  const size = props.size || 5;
  return (
    <div className={clsx("flex flex-row", props.className)}>
      <svg
        aria-hidden="true"
        className={clsx(
          "mr-2 text-gray-200 animate-spin dark:text-gray-600 fill-black stroke-1",
          // NOTE(review): `w-${size}`/`h-${size}` are built dynamically;
          // Tailwind's JIT only emits classes it sees literally in source,
          // so non-default sizes may render unsized unless safelisted — confirm.
          `w-${size} h-${size}`
        )}
        viewBox="0 0 100 101"
        fill="none"
        xmlns="http://www.w3.org/2000/svg"
      >
        <path
          d="M100 50.5908C100 78.2051 77.6142 100.591 50 100.591C22.3858 100.591 0 78.2051 0 50.5908C0 22.9766 22.3858 0.59082 50 0.59082C77.6142 0.59082 100 22.9766 100 50.5908ZM9.08144 50.5908C9.08144 73.1895 27.4013 91.5094 50 91.5094C72.5987 91.5094 90.9186 73.1895 90.9186 50.5908C90.9186 27.9921 72.5987 9.67226 50 9.67226C27.4013 9.67226 9.08144 27.9921 9.08144 50.5908Z"
          fill="currentColor"
        />
        <path
          d="M93.9676 39.0409C96.393 38.4038 97.8624 35.9116 97.0079 33.5539C95.2932 28.8227 92.871 24.3692 89.8167 20.348C85.8452 15.1192 80.8826 10.7238 75.2124 7.41289C69.5422 4.10194 63.2754 1.94025 56.7698 1.05124C51.7666 0.367541 46.6976 0.446843 41.7345 1.27873C39.2613 1.69328 37.813 4.19778 38.4501 6.62326C39.0873 9.04874 41.5694 10.4717 44.0505 10.1071C47.8511 9.54855 51.7191 9.52689 55.5402 10.0491C60.8642 10.7766 65.9928 12.5457 70.6331 15.2552C75.2735 17.9648 79.3347 21.5619 82.5849 25.841C84.9175 28.9121 86.7997 32.2913 88.1811 35.8758C89.083 38.2158 91.5421 39.6781 93.9676 39.0409Z"
          fill="currentFill"
        />
      </svg>
    </div>
  );
}

@ -0,0 +1,18 @@
import React, { memo } from "react";
import LoadingSpinner from "./LoadingSpinner";
type LoadingTextProps = {
text: string;
};
/**
 * Inline loading indicator: a spinner followed by an optional text label.
 */
function LoadingText(props: LoadingTextProps) {
  const { text } = props;

  return (
    <div className="text-gray-500 text-md flex flex-row justify-center items-center">
      <LoadingSpinner />
      {text && <div className="flex">{text}</div>}
    </div>
  );
}

export default memo(LoadingText);

@ -0,0 +1,6 @@
import "@/styles/globals.css";
import type { AppProps } from "next/app";
// Custom Next.js App: renders the active page; global CSS is imported above.
export default function App({ Component, pageProps }: AppProps) {
  return <Component {...pageProps} />;
}

@ -0,0 +1,13 @@
import { Html, Head, Main, NextScript } from "next/document";
// Custom Next.js Document: sets the page language and standard HTML shell.
export default function Document() {
  return (
    <Html lang="en">
      <Head />
      <body>
        <Main />
        <NextScript />
      </body>
    </Html>
  );
}

@ -0,0 +1,35 @@
import Head from "next/head";
import { useState } from "react";
import FileQandAArea from "../components/FileQandAArea";
import { FileLite } from "../types/file";
import FileUploadArea from "../components/FileUploadArea";
// Main page: a file-upload area feeding a Q&A area that answers questions
// from the uploaded files' content.
export default function FileQandA() {
  // Uploaded files, shared between the upload area and the Q&A area.
  const [files, setFiles] = useState<FileLite[]>([]);

  return (
    <div className="flex items-left text-left h-screen flex-col">
      <Head>
        <title>File Q&A</title>
      </Head>
      <div className="max-w-3xl mx-auto m-8 space-y-8 text-gray-800">
        <h1 className="text-4xl">File Q&A</h1>
        <div className="">
          To search for answers from the content in your files, upload them here
          and we will use OpenAI embeddings and GPT to find answers from the
          relevant documents.
        </div>
        <FileUploadArea
          handleSetFiles={setFiles}
          maxNumFiles={75}
          maxFileSizeMB={30}
        />
        <FileQandAArea files={files} />
      </div>
    </div>
  );
}

@ -0,0 +1,14 @@
// Returns true if `fileName` is contained in `str` after lowercasing both
// and stripping punctuation and whitespace from both.
export const isFileNameInString = (fileName: string, str: string) => {
  // NOTE: the hyphen is placed LAST in the character class so it is a
  // literal "-". The previous class contained "=-_", which the regex engine
  // parsed as the range "=".."_" — accidentally matching ">?@[\]^" while
  // never stripping an actual hyphen from file names.
  const punctuationOrWhitespace = /[.,/#!$%^&*;:{}=_~()\s-]/g;

  // Convert both to lowercase and remove punctuation and whitespace
  const normalizedFileName = fileName
    .toLowerCase()
    .replace(punctuationOrWhitespace, "");
  const normalizedStr = str.toLowerCase().replace(punctuationOrWhitespace, "");

  // Return true if the normalized file name is included in the normalized string
  return normalizedStr.includes(normalizedFileName);
};

@ -0,0 +1,5 @@
@import "./preflight.css";
@tailwind base;
@tailwind components;
@tailwind utilities;

@ -0,0 +1,368 @@
/* Using a custom preflight to fix conflicts with Ant Design */
/* Original: https://unpkg.com/tailwindcss@3.2.4/src/css/preflight.css */
/*
1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4)
2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116)
*/
*,
::before,
::after {
box-sizing: border-box; /* 1 */
border-width: 0; /* 2 */
border-style: solid; /* 2 */
border-color: theme("borderColor.DEFAULT"); /* 2 */
}
::before,
::after {
--tw-content: "";
}
/*
1. Use a consistent sensible line-height in all browsers.
2. Prevent adjustments of font size after orientation changes in iOS.
3. Use a more readable tab size.
4. Use the user's configured `sans` font-family by default.
5. Use the user's configured `sans` font-feature-settings by default.
*/
html {
line-height: 1.5; /* 1 */
-webkit-text-size-adjust: 100%; /* 2 */
-moz-tab-size: 4; /* 3 */
tab-size: 4; /* 3 */
font-family: theme("fontFamily.sans"); /* 4 */
}
/*
1. Remove the margin in all browsers.
2. Inherit line-height from `html` so users can set them as a class directly on the `html` element.
*/
body {
margin: 0; /* 1 */
line-height: inherit; /* 2 */
}
/*
1. Add the correct height in Firefox.
2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655)
3. Ensure horizontal rules are visible by default.
*/
hr {
height: 0; /* 1 */
color: inherit; /* 2 */
border-top-width: 1px; /* 3 */
}
/*
Add the correct text decoration in Chrome, Edge, and Safari.
*/
abbr:where([title]) {
text-decoration: underline dotted;
}
/*
Remove the default font size and weight for headings.
*/
h1,
h2,
h3,
h4,
h5,
h6 {
font-size: inherit;
font-weight: inherit;
}
/*
Reset links to optimize for opt-in styling instead of opt-out.
*/
a {
color: inherit;
text-decoration: inherit;
}
/*
Add the correct font weight in Edge and Safari.
*/
b,
strong {
font-weight: bolder;
}
/*
1. Use the user's configured `mono` font family by default.
2. Correct the odd `em` font sizing in all browsers.
*/
code,
kbd,
samp,
pre {
font-family: theme("fontFamily.mono"); /* 1 */
font-size: 1em; /* 2 */
}
/*
Add the correct font size in all browsers.
*/
small {
font-size: 80%;
}
/*
Prevent `sub` and `sup` elements from affecting the line height in all browsers.
*/
sub,
sup {
font-size: 75%;
line-height: 0;
position: relative;
vertical-align: baseline;
}
sub {
bottom: -0.25em;
}
sup {
top: -0.5em;
}
/*
1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297)
2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016)
3. Remove gaps between table borders by default.
*/
table {
text-indent: 0; /* 1 */
border-color: inherit; /* 2 */
border-collapse: collapse; /* 3 */
}
/*
1. Change the font styles in all browsers.
2. Remove the margin in Firefox and Safari.
3. Remove default padding in all browsers.
*/
button,
input,
optgroup,
select,
textarea {
font-family: inherit; /* 1 */
font-size: 100%; /* 1 */
font-weight: inherit; /* 1 */
line-height: inherit; /* 1 */
color: inherit; /* 1 */
margin: 0; /* 2 */
padding: 0; /* 3 */
}
/*
Remove the inheritance of text transform in Edge and Firefox.
*/
button,
select {
text-transform: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Remove default button styles.
*/
button,
[type="button"],
[type="reset"],
[type="submit"] {
-webkit-appearance: button; /* 1 */
background-image: none; /* 2 */
}
/*
Use the modern Firefox focus style for all focusable elements.
*/
:-moz-focusring {
outline: auto;
}
/*
Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737)
*/
:-moz-ui-invalid {
box-shadow: none;
}
/*
Add the correct vertical alignment in Chrome and Firefox.
*/
progress {
vertical-align: baseline;
}
/*
Correct the cursor style of increment and decrement buttons in Safari.
*/
::-webkit-inner-spin-button,
::-webkit-outer-spin-button {
height: auto;
}
/*
1. Correct the odd appearance in Chrome and Safari.
2. Correct the outline style in Safari.
*/
[type="search"] {
-webkit-appearance: textfield; /* 1 */
outline-offset: -2px; /* 2 */
}
/*
Remove the inner padding in Chrome and Safari on macOS.
*/
::-webkit-search-decoration {
-webkit-appearance: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Change font properties to `inherit` in Safari.
*/
::-webkit-file-upload-button {
-webkit-appearance: button; /* 1 */
font: inherit; /* 2 */
}
/*
Add the correct display in Chrome and Safari.
*/
summary {
display: list-item;
}
/*
Removes the default spacing and border for appropriate elements.
*/
blockquote,
dl,
dd,
h1,
h2,
h3,
h4,
h5,
h6,
hr,
figure,
p,
pre {
margin: 0;
}
fieldset {
margin: 0;
padding: 0;
}
legend {
padding: 0;
}
ol,
ul,
menu {
list-style: none;
margin: 0;
padding: 0;
}
/*
Prevent resizing textareas horizontally by default.
*/
textarea {
resize: vertical;
}
/*
1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300)
2. Set the default placeholder color to the user's configured gray 400 color.
*/
input::placeholder,
textarea::placeholder {
opacity: 1; /* 1 */
color: theme("colors.gray.400"); /* 2 */
}
/*
Set the default cursor for buttons.
*/
button,
[role="button"] {
cursor: pointer;
}
/*
Make sure disabled buttons don't get the pointer cursor.
*/
:disabled {
cursor: default;
}
/*
1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14)
2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210)
This can trigger a poorly considered lint error in some tools but is included by design.
*/
img,
svg,
video,
canvas,
audio,
iframe,
embed,
object {
display: block; /* 1 */
vertical-align: middle; /* 2 */
}
/*
Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. (https://github.com/mozdevs/cssremedy/issues/14)
*/
img,
video {
max-width: 100%;
height: auto;
}
/* Make elements with the HTML hidden attribute stay hidden by default */
[hidden] {
display: none;
}

@ -0,0 +1 @@
// Base URL of the backend server the client posts to
// (used for /answer_question and /process_file requests).
export const SERVER_ADDRESS = "http://localhost:8080";

@ -0,0 +1,21 @@
// Lightweight description of an uploaded file as used by the client UI,
// optionally carrying its extracted text and embeddings.
export interface FileLite {
  expanded?: boolean; // UI state: whether the file row starts expanded
  name: string;
  url?: string; // object URL used for the inline iframe preview
  type?: string;
  score?: number; // relevance score shown next to the file when present
  size?: number;
  embedding?: number[]; // The file embedding -- or mean embedding if there are multiple embeddings for the file
  chunks?: TextEmbedding[]; // The chunks of text and their embeddings
  extractedText?: string; // The extracted text from the file
}

// One chunk of a file's text, tagged with its source file and an
// optional similarity score.
export interface FileChunk extends TextEmbedding {
  filename: string;
  score?: number;
}

// A piece of text together with its embedding vector.
export interface TextEmbedding {
  text: string;
  embedding: number[];
}

@ -0,0 +1,28 @@
const { fontFamily } = require("tailwindcss/defaultTheme");
/** @type {import('tailwindcss').Config} */
module.exports = {
content: [
"./app/**/*.{js,ts,jsx,tsx}",
"./src/**/*.{js,ts,jsx,tsx}",
"./pages/**/*.{js,ts,jsx,tsx}",
"./components/**/*.{js,ts,jsx,tsx}",
],
corePlugins: {
preflight: false,
},
theme: {
extend: {
},
},
keyframes: {
blink: {
"0%, 100%": { opacity: 1 },
"50%": { opacity: 0 },
},
},
plugins: [
require("@tailwindcss/line-clamp"),
require("@tailwindcss/typography"),
],
};

@ -0,0 +1,24 @@
{
"compilerOptions": {
"target": "es5",
"lib": ["dom", "dom.iterable", "esnext"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
"forceConsistentCasingInFileNames": true,
"noEmit": true,
"esModuleInterop": true,
"module": "esnext",
"moduleResolution": "node",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"incremental": true,
"baseUrl": ".",
"paths": {
"@/*": ["./src/*"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"],
"exclude": ["node_modules"]
}

@ -0,0 +1,77 @@
from utils import get_embedding
from flask import jsonify
from config import *
from flask import current_app
import openai
from config import *
TOP_K = 10
def get_answer_from_files(question, session_id, pinecone_index):
    """Answer `question` from the most relevant file chunks stored in Pinecone.

    Embeds the question, queries the `session_id` namespace for the TOP_K most
    similar chunks, builds a prompt containing those chunks (labelled with
    their filenames), and asks the completions endpoint for an answer.

    Returns a Flask JSON response {"answer": ...} on success, or the error
    message string on failure.
    """
    logging.info(f"Getting answer for question: {question}")

    search_query_embedding = get_embedding(question, EMBEDDINGS_MODEL)

    try:
        query_response = pinecone_index.query(
            namespace=session_id,
            top_k=TOP_K,
            include_values=False,
            include_metadata=True,
            vector=search_query_embedding,
        )
        logging.info(
            f"[get_answer_from_files] received query response from Pinecone: {query_response}")

        files_string = ""
        file_text_dict = current_app.config["file_text_dict"]

        for i in range(len(query_response.matches)):
            result = query_response.matches[i]
            file_chunk_id = result.id
            score = result.score
            filename = result.metadata["filename"]
            file_text = file_text_dict.get(file_chunk_id)
            # Label the extract with its real filename so the model can cite
            # sources, as the prompt below instructs it to do.  (Previously a
            # hard-coded "(unknown)" was used and `filename` went unused.)
            file_string = f"###\n\"{filename}\"\n{file_text}\n"
            # Stop once scores fall below the relevance threshold, but always
            # keep at least the single best match (i == 0).
            if score < COSINE_SIM_THRESHOLD and i > 0:
                logging.info(
                    f"[get_answer_from_files] score {score} is below threshold {COSINE_SIM_THRESHOLD} and i is {i}, breaking")
                break
            files_string += file_string

        prompt = f"Given a question, try to answer it using the content of the file extracts below, and if you cannot answer, or find " \
            f"a relevant file, just output \"I couldn't find the answer to that question in your files.\".\n\n" \
            f"If the answer is not contained in the files or if there are no file extracts, respond with \"I couldn't find the answer " \
            f"to that question in your files.\" If the question is not actually a question, respond with \"That's not a valid question.\"\n\n" \
            f"In the cases where you can find the answer, first give the answer. Then explain how you found the answer from the source or sources, " \
            f"and use the exact filenames of the source files you mention. Do not make up the names of any other files other than those mentioned "\
            f"in the files context. Give the answer in markdown format." \
            f"Use the following format:\n\nQuestion: <question>\n\nFiles:\n<###\n\"filename 1\"\nfile text>\n<###\n\"filename 2\"\nfile text>...\n\n"\
            f"Answer: <answer or \"I couldn't find the answer to that question in your files\" or \"That's not a valid question.\">\n\n" \
            f"Question: {question}\n\n" \
            f"Files:\n{files_string}\n" \
            f"Answer:"

        logging.info(f"[get_answer_from_files] prompt: {prompt}")

        # Deterministic completion (temperature=0) for reproducible answers.
        response = openai.Completion.create(
            prompt=prompt,
            temperature=0,
            max_tokens=1000,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            engine=GENERATIVE_MODEL,
        )

        answer = response.choices[0].text.strip()
        logging.info(f"[get_answer_from_files] answer: {answer}")

        return jsonify({"answer": answer})

    except Exception as e:
        logging.info(f"[get_answer_from_files] error: {e}")
        return str(e)

@ -0,0 +1,100 @@
from __future__ import print_function
from config import *
import tiktoken
import pinecone
import uuid
import sys
import logging
from flask import Flask, jsonify
from flask_cors import CORS, cross_origin
from flask import request
from handle_file import handle_file
from answer_question import get_answer_from_files
# Log to both a debug file and stdout.  basicConfig is a no-op once the root
# logger is configured, so configure it exactly once (the original file
# contained this identical call twice; the second call was dead code).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("debug.log"),
        logging.StreamHandler(sys.stdout)
    ]
)
def load_pinecone_index() -> pinecone.Index:
    """
    Load index from Pinecone, raise error if the index can't be found.

    Raises:
        KeyError: if PINECONE_INDEX is not among the account's indexes.
    """
    pinecone.init(
        api_key=PINECONE_API_KEY,
        environment=PINECONE_ENV,
    )
    index_name = PINECONE_INDEX
    if index_name not in pinecone.list_indexes():
        # Surface the available indexes so misconfiguration is easy to spot.
        logging.error(f"Available indexes: {pinecone.list_indexes()}")
        raise KeyError(f"Index '{index_name}' does not exist.")
    index = pinecone.Index(index_name)
    return index
def create_app():
    """Create and configure the Flask app.

    Loads the Pinecone index, a GPT-2 tokenizer, and a fresh random session
    id, and attaches them to the app object so route handlers can reach them.
    """
    pinecone_index = load_pinecone_index()
    tokenizer = tiktoken.get_encoding("gpt2")
    # One session id per server process; used as the Pinecone namespace for
    # all upserts and queries made by this server.
    session_id = str(uuid.uuid4().hex)
    app = Flask(__name__)
    app.pinecone_index = pinecone_index
    app.tokenizer = tokenizer
    app.session_id = session_id
    # log session id
    logging.info(f"session_id: {session_id}")
    # In-memory map from chunk id to chunk text, shared with route handlers.
    app.config["file_text_dict"] = {}
    CORS(app, supports_credentials=True)
    return app
app = create_app()
# Accept a single uploaded file, extract its text, embed it, and upsert the
# vectors into Pinecone under this server's session namespace.
@app.route(f"/process_file", methods=["POST"])
@cross_origin(supports_credentials=True)
def process_file():
    try:
        file = request.files['file']
        logging.info(str(file))
        handle_file(
            file, app.session_id, app.pinecone_index, app.tokenizer)
        return jsonify({"success": True})
    except Exception as e:
        logging.error(str(e))
        # Swallow the exception and report failure; the client only needs
        # a success flag.
        return jsonify({"success": False})
# Answer a question against the files previously processed in this session.
@app.route(f"/answer_question", methods=["POST"])
@cross_origin(supports_credentials=True)
def answer_question():
    try:
        params = request.get_json()
        question = params["question"]
        answer_question_response = get_answer_from_files(
            question, app.session_id, app.pinecone_index)
        return answer_question_response
    except Exception as e:
        # NOTE(review): returning str(e) exposes internal error details to
        # the client -- consider a generic message instead.
        return str(e)
# Simple liveness probe.
@app.route("/healthcheck", methods=["GET"])
@cross_origin(supports_credentials=True)
def healthcheck():
    return "OK"
if __name__ == "__main__":
    app.run(debug=True, port=SERVER_PORT, threaded=True)

@ -0,0 +1,35 @@
from pathlib import Path
import logging
import sys
from pprint import pformat
import yaml
# Load config items from config.yaml.
# Use Path.resolve() to get the absolute path of the parent directory
yaml_dir = Path(__file__).resolve().parent
yaml_path = yaml_dir / "config.yaml" # Use Path / operator to join paths
def load_yaml_config(path):
    """Load a yaml file and return a dictionary of its contents."""
    # NOTE(review): only yaml.YAMLError is caught here; a missing file raises
    # OSError from open() and is not handled -- confirm this is intended.
    try:
        with open(path, "r") as stream:
            return yaml.safe_load(stream)
    except yaml.YAMLError as exc:
        logging.error(f"Failed to load {path}: {exc}")
        return None
# Load the config and update the global variables.
# Every top-level key in config.yaml becomes a module-level variable here,
# which other modules pick up via `from config import *`.
yaml_config = load_yaml_config(yaml_path)
if yaml_config is not None:
    logging.info(f"Loaded config from {yaml_path}:")
    logging.info(pformat(yaml_config))
    globals().update(yaml_config)
else:
    logging.error(f"Could not load config from {yaml_path}.")
    sys.exit(1)  # Exit the program if the config is invalid
# Re-read SERVER_PORT explicitly (None if it is not in the config); redundant
# with globals().update above, but makes the name visible to static tools.
SERVER_PORT = yaml_config.get("SERVER_PORT", None)
# Use Path.resolve() to get the absolute path of the current directory
SERVER_DIR = Path(__file__).resolve().parent

@ -0,0 +1,18 @@
# ----- PINECONE CONFIG -----
PINECONE_API_KEY: "<your Pinecone API key>"
PINECONE_INDEX: "<your Pinecone Index name>" # dimensions: 1536, metric: cosine similarity
PINECONE_ENV: "<your Pinecone env e.g.us-west1-gcp>"
# ----- SERVER PORT ----
SERVER_PORT: "8080"
# ---- OPENAI CONFIG -----
EMBEDDINGS_MODEL: "text-embedding-ada-002"
GENERATIVE_MODEL: "text-davinci-003"
EMBEDDING_DIMENSIONS: 1536
TEXT_EMBEDDING_CHUNK_SIZE: 200
# This is the minimum cosine similarity score that a file must have with the search query to be considered relevant
# This is an arbitrary value, and you should vary or remove it depending on the diversity of your dataset
COSINE_SIM_THRESHOLD: 0.7
MAX_TEXTS_TO_EMBED_BATCH_SIZE: 100
MAX_PINECONE_VECTORS_TO_UPSERT_PATCH_SIZE: 100

@ -0,0 +1,168 @@
import logging
import sys
import docx2txt
from PyPDF2 import PdfReader
from numpy import array, average
from flask import current_app
from config import *
from utils import get_embeddings, get_pinecone_id_for_file_chunk
# Set up logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler("debug.log"),
logging.StreamHandler(sys.stdout)
]
)
# Handle a file by extracting its text, creating embeddings, and upserting them to Pinecone
def handle_file(file, session_id, pinecone_index, tokenizer):
    """Handle a file by extracting its text, creating embeddings, and upserting them to Pinecone.

    Args:
        file: uploaded file object with `.filename` and `.mimetype`
            (Flask/werkzeug upload -- TODO confirm).
        session_id: Pinecone namespace for this server session.
        pinecone_index: target Pinecone index.
        tokenizer: tokenizer used to chunk the extracted text.

    Raises:
        ValueError: if the file's mimetype is not supported.
    """
    filename = file.filename
    logging.info("[handle_file] Handling file: {}".format(filename))
    # Get the file text dict from the current app config
    file_text_dict = current_app.config["file_text_dict"]
    # Extract text from the file
    try:
        extracted_text = extract_text_from_file(file)
    except ValueError as e:
        logging.error(
            "[handle_file] Error extracting text from file: {}".format(e))
        raise e
    # Save extracted text to file text dict, keyed by the bare filename
    file_text_dict[filename] = extracted_text
    # Handle the extracted text as a string
    return handle_file_string(filename, session_id, extracted_text, pinecone_index, tokenizer, file_text_dict)
# Extract text from a file based on its mimetype
def extract_text_from_file(file):
    """Return the text content of a file.

    Supports PDF, plain text (UTF-8), and .docx uploads, dispatching on the
    file's mimetype.

    Raises:
        ValueError: for any unsupported mimetype.
    """
    if file.mimetype == "application/pdf":
        # Extract text from pdf using PyPDF2.  extract_text() can return
        # None for pages without a text layer (e.g. scanned images), which
        # previously crashed the concatenation -- coalesce to "".
        reader = PdfReader(file)
        extracted_text = ""
        for page in reader.pages:
            extracted_text += page.extract_text() or ""
    elif file.mimetype == "text/plain":
        # Read text from plain text file
        extracted_text = file.read().decode("utf-8")
        file.close()
    elif file.mimetype == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
        # Extract text from docx using docx2txt
        extracted_text = docx2txt.process(file)
    else:
        # Unsupported file type
        raise ValueError("Unsupported file type: {}".format(file.mimetype))
    return extracted_text
# Handle a file string by creating embeddings and upserting them to Pinecone
def handle_file_string(filename, session_id, file_body_string, pinecone_index, tokenizer, file_text_dict):
    """Handle a file string by creating embeddings and upserting them to Pinecone.

    Also records each chunk's text in `file_text_dict`, keyed by its Pinecone
    vector id, so answers can quote the chunk later.

    Raises:
        Exception: re-raised from embedding creation or Pinecone upsert.
    """
    logging.info("[handle_file_string] Starting...")
    # Clean up the file string by replacing newlines and double spaces
    clean_file_body_string = file_body_string.replace(
        "\n", "; ").replace("  ", " ")
    # Add the filename to the text to embed
    text_to_embed = "Filename is: {}; {}".format(
        filename, clean_file_body_string)
    # Create embeddings for the text
    try:
        text_embeddings, average_embedding = create_embeddings_for_text(
            text_to_embed, tokenizer)
        logging.info(
            "[handle_file_string] Created embedding for {}".format(filename))
    except Exception as e:
        logging.error(
            "[handle_file_string] Error creating embedding: {}".format(e))
        raise e
    # Get the vectors array of triples: file_chunk_id, embedding, metadata for each embedding
    # Metadata is a dict with keys: filename, file_chunk_index
    vectors = []
    for i, (text_chunk, embedding) in enumerate(text_embeddings):
        id = get_pinecone_id_for_file_chunk(session_id, filename, i)
        # Remember the chunk text so get_answer_from_files can quote it.
        file_text_dict[id] = text_chunk
        vectors.append(
            (id, embedding, {"filename": filename, "file_chunk_index": i}))
        logging.info(
            "[handle_file_string] Text chunk {}: {}".format(i, text_chunk))
    # Split the vectors array into batches of at most
    # MAX_PINECONE_VECTORS_TO_UPSERT_PATCH_SIZE vectors each.
    batch_size = MAX_PINECONE_VECTORS_TO_UPSERT_PATCH_SIZE
    batches = [vectors[i:i+batch_size] for i in range(0, len(vectors), batch_size)]
    # Upsert each batch to Pinecone
    for batch in batches:
        try:
            pinecone_index.upsert(
                vectors=batch, namespace=session_id)
            logging.info(
                "[handle_file_string] Upserted batch of embeddings for {}".format(filename))
        except Exception as e:
            logging.error(
                "[handle_file_string] Error upserting batch of embeddings to Pinecone: {}".format(e))
            raise e
# Compute the column-wise average of a list of lists
def get_col_average_from_list_of_lists(list_of_lists):
    """Return the column-wise average of a list of equal-length lists."""
    if len(list_of_lists) == 1:
        # Nothing to average over; return the single row unchanged.
        return list_of_lists[0]
    # Stack the rows into a 2-D array and average down each column.
    stacked = array(list_of_lists)
    return average(stacked, axis=0).tolist()
# Create embeddings for a text using a tokenizer and an OpenAI engine
def create_embeddings_for_text(text, tokenizer):
    """Return a list of tuples (text_chunk, embedding) and an average embedding for a text.

    The text is split into chunks of about TEXT_EMBEDDING_CHUNK_SIZE tokens,
    embedded in batches, and the column-wise mean of all chunk embeddings is
    returned as the file-level embedding.
    """
    token_chunks = list(chunks(text, TEXT_EMBEDDING_CHUNK_SIZE, tokenizer))
    text_chunks = [tokenizer.decode(chunk) for chunk in token_chunks]
    # Split text_chunks into batches of at most MAX_TEXTS_TO_EMBED_BATCH_SIZE
    # chunks, to respect the embeddings API's batch limit.
    text_chunks_arrays = [text_chunks[i:i+MAX_TEXTS_TO_EMBED_BATCH_SIZE] for i in range(0, len(text_chunks), MAX_TEXTS_TO_EMBED_BATCH_SIZE)]
    # Call get_embeddings for each shorter array and combine the results
    embeddings = []
    for text_chunks_array in text_chunks_arrays:
        embeddings_response = get_embeddings(text_chunks_array, EMBEDDINGS_MODEL)
        embeddings.extend([embedding["embedding"] for embedding in embeddings_response])
    # Pair each chunk with its embedding, and compute the mean embedding.
    text_embeddings = list(zip(text_chunks, embeddings))
    average_embedding = get_col_average_from_list_of_lists(embeddings)
    return (text_embeddings, average_embedding)
# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
def chunks(text, n, tokenizer):
    """Yield successive chunks of roughly n tokens from text.

    Each chunk preferably ends at a sentence boundary (a "." or newline)
    found between 0.5*n and 1.5*n tokens; when no boundary exists in that
    window, exactly n tokens are taken.  Yields lists of token ids (decode
    them with the same tokenizer).
    """
    # The docstring above was previously placed after this statement, where
    # it was a no-op string expression rather than the function's docstring.
    tokens = tokenizer.encode(text)
    i = 0
    while i < len(tokens):
        # Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
        j = min(i + int(1.5 * n), len(tokens))
        while j > i + int(0.5 * n):
            # Decode the tokens and check for full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith(".") or chunk.endswith("\n"):
                break
            j -= 1
        # If no end of sentence found, use n tokens as the chunk size
        if j == i + int(0.5 * n):
            j = min(i + n, len(tokens))
        yield tokens[i:j]
        i = j

@ -0,0 +1,11 @@
Flask-Cors==3.0.10
openai==0.13.0
pinecone-client==2.0.13
PyPDF2==2.10.4
numpy==1.23.2
scikit-learn==1.1.2
docx2txt==0.8
flask>=1.1.4
jinja2==3.0.1
PyYAML==6.0
tiktoken==0.1.2

@ -0,0 +1,10 @@
#!/bin/bash
# Bootstrap and start the Flask server inside a fresh virtualenv.
# Usage: ./<script> <OPENAI_API_KEY>
set -e
echo "Starting Python server..."
# Create an isolated virtualenv and install dependencies on every run.
pip3 install virtualenv
python3 -m virtualenv venv
source venv/bin/activate
pip3 install -r requirements.txt
# The first script argument is the OpenAI API key, passed via the environment.
OPENAI_API_KEY=$1 python3 app.py

@ -0,0 +1,38 @@
import openai
import logging
import sys
import time
from config import *
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler("debug.log"),
logging.StreamHandler(sys.stdout)
]
)
def get_pinecone_id_for_file_chunk(session_id, filename, chunk_index):
    """Build the deterministic Pinecone vector id for one chunk of a file."""
    return f"{session_id}-!{filename}-!{chunk_index}"
def get_embedding(text, engine):
    """Return the embedding vector (a list of floats) for a single text using the given engine."""
    return openai.Engine(id=engine).embeddings(input=[text])["data"][0]["embedding"]
def get_embeddings(text_array, engine):
    """Embed a batch of texts, retrying with exponential backoff on failure.

    Returns the raw "data" list from the embeddings response (one entry per
    input text).  After max_retries failed attempts the last exception is
    re-raised.
    """
    # Parameters for exponential backoff
    max_retries = 5 # Maximum number of retries
    base_delay = 1 # Base delay in seconds
    factor = 2 # Factor to multiply the delay by after each retry
    while True:
        try:
            return openai.Engine(id=engine).embeddings(input=text_array)["data"]
        except Exception as e:
            # NOTE(review): retries on *any* exception, including
            # non-retryable ones (e.g. auth errors) -- consider narrowing.
            if max_retries > 0:
                logging.info(f"Request failed. Retrying in {base_delay} seconds.")
                time.sleep(base_delay)
                max_retries -= 1
                base_delay *= factor
            else:
                raise e

@ -0,0 +1,4 @@
# create a copy of this file named .env.local
# Your own API key for OpenAI
OPENAI_API_KEY='sk-......'

@ -0,0 +1,3 @@
{
"extends": "next/core-web-vitals"
}

@ -0,0 +1,36 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.js
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# local env files
.env*.local
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts

@ -0,0 +1,39 @@
# File Q&A
File Q&A is a [Next.js](https://nextjs.org/) app that lets you find answers in your files using OpenAI APIs. You can upload files and ask questions related to their content, and the app will use embeddings and GPT to generate answers from the most relevant files.
## Requirements
To run the app, you need an OpenAI API key. You can create a new API key [here](https://beta.openai.com/account/api-keys).
## Set Up
If you don't have Node.js and npm already, install them from [https://nodejs.org/en/download/](https://nodejs.org/en/download/).
In your terminal, navigate to the `nextjs` directory of this example app, and then install dependencies:
```
npm install
```
Copy the `.env.local.example` file into a new file named `.env.local`, and fill in the OpenAI API key field.
## Development
Run the development server:
```
npm run dev
```
Open [http://localhost:3000](http://localhost:3000) with your browser to see the app.
## Deployment
You can deploy the app on [Vercel](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme), the platform from the creators of Next.js. Check out the [Next.js deployment documentation](https://nextjs.org/docs/deployment) for more details.
## Limitations
Uploaded files and generated embeddings don't persist on browser refresh. If you want to store more embeddings, we recommend using a vector database (e.g. Pinecone, Weaviate, Milvus, Qdrant, Redis, FAISS, etc.). The `nextjs-with-flask-server` version of this demo uses a Pinecone vector database.
The app may sometimes generate answers that are not in the files, or hallucinate about the existence of files that are not uploaded.

@ -0,0 +1,6 @@
/** @type {import('next').NextConfig} */
const nextConfig = {
reactStrictMode: true,
}
module.exports = nextConfig

File diff suppressed because it is too large Load Diff

@ -0,0 +1,44 @@
{
"name": "file-q-and-a",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev",
"build": "next build",
"start": "next start",
"lint": "next lint"
},
"dependencies": {
"@headlessui/react": "^1.7.7",
"@heroicons/react": "^2.0.13",
"@next/font": "13.1.2",
"@tailwindcss/line-clamp": "^0.4.2",
"@tailwindcss/typography": "^0.5.9",
"@types/formidable": "^2.0.5",
"@types/lodash": "^4.14.191",
"@types/node": "18.11.18",
"@types/pdf-parse": "^1.1.1",
"@types/react": "18.0.27",
"@types/react-dom": "18.0.10",
"axios": "^1.2.3",
"clsx": "^1.2.1",
"eslint": "8.32.0",
"eslint-config-next": "13.1.2",
"formidable": "^2.1.1",
"lodash": "^4.17.21",
"mammoth": "^1.5.1",
"next": "13.1.2",
"node-html-markdown": "^1.3.0",
"openai": "^3.1.0",
"pdf-parse": "^1.1.1",
"react": "18.2.0",
"react-dom": "18.2.0",
"react-markdown": "^8.0.5",
"typescript": "4.9.4"
},
"devDependencies": {
"autoprefixer": "^10.4.13",
"postcss": "^8.4.21",
"tailwindcss": "^3.2.4"
}
}

@ -0,0 +1,6 @@
module.exports = {
plugins: {
tailwindcss: {},
autoprefixer: {},
},
};

Binary file not shown.

After

Width:  |  Height:  |  Size: 262 KiB

@ -0,0 +1 @@
<svg id="openai-horizontal" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 120 29.53"><path d="M40.7,6.98s-.05,0-.07,0c-.02,0-.05,0-.07,0-4.67,0-7.58,2.91-7.58,7.6v2.53c0,4.69,2.9,7.6,7.58,7.6,.02,0,.05,0,.07,0,.02,0,.05,0,.07,0,4.67,0,7.58-2.91,7.58-7.6v-2.53c0-4.69-2.91-7.6-7.58-7.6Zm4.31,10.31c0,3.08-1.6,4.86-4.38,4.89-2.78-.03-4.38-1.81-4.38-4.89v-2.88c0-3.08,1.6-4.86,4.38-4.89,2.78,.03,4.38,1.81,4.38,4.89v2.88Zm40.57-5.79s-.06,0-.09,0c-.02,0-.03,0-.05,0-1.77,0-3.03,.6-3.65,1.75l-.19,.35v-1.8h-3.02v12.56h3.17v-7.48c0-1.76,.95-2.77,2.59-2.8,1.57,.03,2.47,1.02,2.47,2.73v7.55h3.17v-8.09c0-2.99-1.64-4.77-4.39-4.77Zm34.42-1.77v-2.4h-10.46v2.4h3.67v12.22h-3.67v2.4h10.46v-2.4h-3.67V9.73h3.67Zm-18.75-2.4h0s-3.28,0-3.28,0l-6.1,17.04h3.43l1.17-3.65h6.66v.04s1.17,3.62,1.17,3.62h3.43l-6.11-17.04h-.36Zm-4.03,10.98l2.57-8.05,2.55,8.05h-5.12Zm-39.45-6.81s-.05,0-.07,0c-.03,0-.05,0-.07,0-1.59,0-2.96,.66-3.68,1.76l-.18,.28v-1.74h-3.02V28.69h3.17v-5.9l.18,.27c.68,1.01,2.01,1.61,3.56,1.61,.03,0,.05,0,.08,0,.02,0,.04,0,.07,0,2.61,0,5.24-1.7,5.24-5.51v-2.14c0-2.74-1.62-5.51-5.26-5.51Zm2.1,7.5c0,2-1.15,3.24-3.01,3.28-1.73-.03-2.94-1.35-2.94-3.23v-1.89c0-1.9,1.22-3.24,2.97-3.28,1.84,.03,2.98,1.28,2.98,3.28v1.84Zm11.05-7.5h0c-.06,0-.12,.01-.18,.01-.06,0-.12-.01-.18-.01h0c-3.57,0-5.78,2.23-5.78,5.81v1.76c0,3.45,2.24,5.59,5.83,5.59,.08,0,.15,0,.22-.01,.05,0,.09,.01,.14,.01,2.41,0,4.09-.88,5.16-2.7l-2.13-1.23c-.71,1.05-1.66,1.84-3.02,1.84-1.82,0-2.91-1.12-2.91-3.01v-.5h8.44v-2.08c0-3.34-2.19-5.49-5.59-5.49Zm-2.86,5.54v-.3c0-2,.95-3.12,2.68-3.2,1.66,.08,2.66,1.18,2.66,2.99v.5s-5.34,0-5.34,0Z"></path><path 
d="M27.21,12.08c.67-2.01,.44-4.21-.63-6.04-1.61-2.8-4.85-4.24-8.01-3.57C17.16,.89,15.14-.01,13.02,0c-3.23,0-6.1,2.08-7.1,5.15-2.08,.43-3.87,1.73-4.92,3.57-1.62,2.8-1.25,6.32,.92,8.72-.67,2.01-.44,4.21,.63,6.03,1.61,2.81,4.85,4.25,8.02,3.58,1.4,1.58,3.42,2.49,5.54,2.48,3.23,0,6.1-2.08,7.1-5.15,2.08-.43,3.87-1.73,4.91-3.57,1.63-2.8,1.26-6.32-.91-8.72Zm-2.3-5.07c.64,1.12,.88,2.43,.66,3.7-.04-.03-.12-.07-.17-.1l-5.88-3.4c-.3-.17-.67-.17-.97,0l-6.89,3.98v-2.92l5.69-3.29c2.65-1.53,6.03-.62,7.56,2.03Zm-13.25,6.07l2.9-1.68,2.9,1.68v3.35l-2.9,1.68-2.9-1.68v-3.35ZM13.01,1.93c1.3,0,2.55,.45,3.55,1.28-.04,.02-.12,.07-.18,.1l-5.88,3.39c-.3,.17-.48,.49-.48,.84v7.96l-2.53-1.46V7.46c0-3.06,2.47-5.53,5.53-5.54ZM2.68,9.69h0c.65-1.12,1.66-1.98,2.88-2.43v6.99c0,.35,.18,.66,.48,.84l6.88,3.97-2.54,1.47-5.68-3.28c-2.64-1.53-3.55-4.91-2.02-7.56Zm1.55,12.83h0c-.65-1.11-.88-2.43-.66-3.7,.04,.03,.12,.07,.17,.1l5.88,3.4c.3,.17,.67,.17,.97,0l6.88-3.98v2.92l-5.69,3.28c-2.65,1.52-6.03,.62-7.56-2.02Zm11.89,5.08c-1.29,0-2.55-.45-3.54-1.28,.04-.02,.13-.07,.18-.1l5.88-3.39c.3-.17,.49-.49,.48-.84v-7.95l2.53,1.46v6.57c0,3.06-2.48,5.54-5.53,5.54Zm10.34-7.76c-.65,1.12-1.67,1.98-2.88,2.42v-6.99c0-.35-.18-.67-.48-.84h0l-6.89-3.98,2.53-1.46,5.69,3.28c2.65,1.53,3.55,4.91,2.02,7.56Z"></path></svg>

After

Width:  |  Height:  |  Size: 2.8 KiB

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" id="openai-symbol" viewBox="0 0 32 32"><path d="M29.71,13.09A8.09,8.09,0,0,0,20.34,2.68a8.08,8.08,0,0,0-13.7,2.9A8.08,8.08,0,0,0,2.3,18.9,8,8,0,0,0,3,25.45a8.08,8.08,0,0,0,8.69,3.87,8,8,0,0,0,6,2.68,8.09,8.09,0,0,0,7.7-5.61,8,8,0,0,0,5.33-3.86A8.09,8.09,0,0,0,29.71,13.09Zm-12,16.82a6,6,0,0,1-3.84-1.39l.19-.11,6.37-3.68a1,1,0,0,0,.53-.91v-9l2.69,1.56a.08.08,0,0,1,.05.07v7.44A6,6,0,0,1,17.68,29.91ZM4.8,24.41a6,6,0,0,1-.71-4l.19.11,6.37,3.68a1,1,0,0,0,1,0l7.79-4.49V22.8a.09.09,0,0,1,0,.08L13,26.6A6,6,0,0,1,4.8,24.41ZM3.12,10.53A6,6,0,0,1,6.28,7.9v7.57a1,1,0,0,0,.51.9l7.75,4.47L11.85,22.4a.14.14,0,0,1-.09,0L5.32,18.68a6,6,0,0,1-2.2-8.18Zm22.13,5.14-7.78-4.52L20.16,9.6a.08.08,0,0,1,.09,0l6.44,3.72a6,6,0,0,1-.9,10.81V16.56A1.06,1.06,0,0,0,25.25,15.67Zm2.68-4-.19-.12-6.36-3.7a1,1,0,0,0-1.05,0l-7.78,4.49V9.2a.09.09,0,0,1,0-.09L19,5.4a6,6,0,0,1,8.91,6.21ZM11.08,17.15,8.38,15.6a.14.14,0,0,1-.05-.08V8.1a6,6,0,0,1,9.84-4.61L18,3.6,11.61,7.28a1,1,0,0,0-.53.91ZM12.54,14,16,12l3.47,2v4L16,20l-3.47-2Z"/></svg>

After

Width:  |  Height:  |  Size: 1.0 KiB

@ -0,0 +1,77 @@
import { useState, useCallback, memo } from "react";
import { Transition } from "@headlessui/react";
import {
MagnifyingGlassMinusIcon,
MagnifyingGlassPlusIcon,
ArrowTopRightOnSquareIcon,
} from "@heroicons/react/24/outline";
import { FileLite } from "../types/file";
type FileProps = {
file: FileLite;
showScore?: boolean;
};
// Renders one uploaded file row: name, optional relevance score, an
// expand/collapse magnifier toggle, an open-in-new-tab link, and (when
// expanded) an inline iframe preview of the file's object URL.
function File(props: FileProps) {
  const [expanded, setExpanded] = useState(false);
  // Toggle the inline preview; the whole row is clickable.
  const handleExpand = useCallback(() => {
    setExpanded((prev) => !prev);
  }, []);
  return (
    <div
      className="border-gray-100 border rounded-md shadow p-2 cursor-pointer"
      onClick={handleExpand}
    >
      <div className="flex flex-row justify-between">
        <div className="flex hover:text-gray-600">{props.file.name}</div>
        <div className="flex flex-row space-x-2">
          {props.showScore && props.file.score && (
            <div className="flex text-blue-600 mr-4">
              {props.file.score.toFixed(2)}
            </div>
          )}
          <div className="ml-auto w-max flex items-center justify-center">
            {expanded ? (
              <MagnifyingGlassMinusIcon className="text-gray-500 h-5" />
            ) : (
              <MagnifyingGlassPlusIcon className="text-gray-500 h-5" />
            )}
          </div>
          <a
            href={props.file.url}
            target="_blank"
            rel="noopener noreferrer"
            onClick={(e) => e.stopPropagation()} // prevent the click event from bubbling up to the list item
          >
            <ArrowTopRightOnSquareIcon className="text-gray-500 h-5" />
          </a>
        </div>
      </div>
      <Transition
        show={expanded}
        enter="transition duration-75 ease-out"
        enterFrom="transform translate-y-4 opacity-0"
        enterTo="transform translate-y-0 opacity-100"
        leave="transition duration-100 ease-out"
        leaveFrom="transform translate-y-0 opacity-100"
        leaveTo="transform translate-y-4 opacity-0"
      >
        <div className="items-center mt-2 justify-center">
          <iframe
            src={props.file.url}
            className="h-full w-full"
            title={props.file.name}
          ></iframe>
        </div>
      </Transition>
    </div>
  );
}
export default memo(File);

@ -0,0 +1,172 @@
import React, { memo, useCallback, useRef, useState } from "react";
import { Transition } from "@headlessui/react";
import axios from "axios";
import ReactMarkdown from "react-markdown";
import FileViewerList from "./FileViewerList";
import LoadingText from "./LoadingText";
import { isFileNameInString } from "../services/utils";
import { FileChunk, FileLite } from "../types/file";
type FileQandAAreaProps = {
files: FileLite[];
};
// Question-answering panel: searches the uploaded files for relevant chunks,
// then streams an answer from the server, rendering it as markdown with the
// cited source files listed underneath.
function FileQandAArea(props: FileQandAAreaProps) {
  const questionRef = useRef(null);
  const [hasAskedQuestion, setHasAskedQuestion] = useState(false);
  const [answerError, setAnswerError] = useState("");
  const [answerLoading, setAnswerLoading] = useState<boolean>(false);
  const [answer, setAnswer] = useState("");
  const [answerDone, setAnswerDone] = useState(false);
  const handleSearch = useCallback(async () => {
    // Guard against re-entry while a request is already in flight.
    if (answerLoading) {
      return;
    }
    const question = (questionRef?.current as any)?.value ?? "";
    setAnswer("");
    setAnswerDone(false);
    if (!question) {
      setAnswerError("Please ask a question.");
      return;
    }
    if (props.files.length === 0) {
      setAnswerError("Please upload files before asking a question.");
      return;
    }
    setAnswerLoading(true);
    setAnswerError("");
    let results: FileChunk[] = [];
    try {
      const searchResultsResponse = await axios.post(
        "/api/search-file-chunks",
        {
          searchQuery: question,
          files: props.files,
          maxResults: 10,
        }
      );
      if (searchResultsResponse.status === 200) {
        results = searchResultsResponse.data.searchResults;
      } else {
        // Don't proceed to ask the model without search results (previously
        // the handler fell through and asked anyway).
        setAnswerError("Sorry, something went wrong!");
        setAnswerLoading(false);
        return;
      }
    } catch (err: any) {
      setAnswerError("Sorry, something went wrong!");
      setAnswerLoading(false);
      return;
    }
    setHasAskedQuestion(true);
    try {
      const res = await fetch("/api/get-answer-from-files", {
        method: "POST",
        headers: {
          "Content-Type": "application/json",
        },
        body: JSON.stringify({
          question,
          fileChunks: results,
        }),
      });
      // Stream the answer chunk-by-chunk into state as it arrives.
      const reader = res.body!.getReader();
      while (true) {
        const { done, value } = await reader.read();
        if (done) {
          setAnswerDone(true);
          break;
        }
        setAnswer((prev) => prev + new TextDecoder().decode(value));
      }
    } catch (err: any) {
      setAnswerError("Sorry, something went wrong!");
    } finally {
      // Always clear the loading flag, even if the stream errored
      // (previously a fetch failure left the UI stuck in "loading").
      setAnswerLoading(false);
    }
  }, [props.files, answerLoading]);
  // Submit the question when the user presses Enter in the search box.
  const handleEnterInSearchBar = useCallback(
    async (event: React.SyntheticEvent) => {
      if ((event as any).key === "Enter") {
        await handleSearch();
      }
    },
    [handleSearch]
  );
  return (
    <div className="space-y-4 text-gray-800">
      <div className="mt-2">
        Ask a question based on the content of your files:
      </div>
      <div className="space-y-2">
        <input
          className="border rounded border-gray-200 w-full py-1 px-2"
          placeholder="e.g. What were the key takeaways from the Q1 planning meeting?"
          name="search"
          ref={questionRef}
          onKeyDown={handleEnterInSearchBar}
        />
        <div
          className="rounded-md bg-gray-50 py-1 px-4 w-max text-gray-500 hover:bg-gray-100 border border-gray-100 shadow cursor-pointer"
          onClick={handleSearch}
        >
          {answerLoading ? (
            <LoadingText text="Answering question..." />
          ) : (
            "Ask question"
          )}
        </div>
      </div>
      <div className="">
        {answerError && <div className="text-red-500">{answerError}</div>}
        <Transition
          show={hasAskedQuestion}
          enter="transition duration-600 ease-out"
          enterFrom="transform opacity-0"
          enterTo="transform opacity-100"
          leave="transition duration-125 ease-out"
          leaveFrom="transform opacity-100"
          leaveTo="transform opacity-0"
          className="mb-8"
        >
          {answer && (
            <div className="">
              <ReactMarkdown className="prose" linkTarget="_blank">
                {`${answer}${answerDone ? "" : " |"}`}
              </ReactMarkdown>
            </div>
          )}
          <Transition
            show={
              props.files.filter((file) =>
                isFileNameInString(file.name, answer)
              ).length > 0
            }
            enter="transition duration-600 ease-out"
            enterFrom="transform opacity-0"
            enterTo="transform opacity-100"
            leave="transition duration-125 ease-out"
            leaveFrom="transform opacity-100"
            leaveTo="transform opacity-0"
            className="mb-8"
          >
            <FileViewerList
              files={props.files.filter((file) =>
                isFileNameInString(file.name, answer)
              )}
              title="Sources"
              listExpanded={true}
            />
          </Transition>
        </Transition>
      </div>
    </div>
  );
}
export default memo(FileQandAArea);

@ -0,0 +1,201 @@
import React, {
Dispatch,
SetStateAction,
useCallback,
useState,
memo,
useRef,
} from "react";
import axios from "axios";
import { ArrowUpTrayIcon } from "@heroicons/react/24/outline";
import { compact } from "lodash";
import LoadingText from "./LoadingText";
import { FileLite } from "../types/file";
import FileViewerList from "./FileViewerList";
type FileUploadAreaProps = {
handleSetFiles: Dispatch<SetStateAction<FileLite[]>>;
maxNumFiles: number;
maxFileSizeMB: number;
};
function FileUploadArea(props: FileUploadAreaProps) {
const handleSetFiles = props.handleSetFiles;
const [files, setFiles] = useState<FileLite[]>([]);
const [loading, setLoading] = useState(false);
const [error, setError] = useState("");
const [dragOver, setDragOver] = useState(false);
const dropzoneRef = useRef<HTMLLabelElement>(null);
const handleFileChange = useCallback(
async (selectedFiles: FileList | null) => {
if (selectedFiles && selectedFiles.length > 0) {
setError("");
if (files.length + selectedFiles.length > props.maxNumFiles) {
setError(`You can only upload up to ${props.maxNumFiles} files.`);
if (dropzoneRef.current) {
(dropzoneRef.current as any).value = "";
}
return;
}
setLoading(true);
const uploadedFiles = await Promise.all(
Array.from(selectedFiles).map(async (file) => {
// Check the file type
if (
file.type.match(
/(text\/plain|application\/(pdf|msword|vnd\.openxmlformats-officedocument\.wordprocessingml\.document)|text\/(markdown|x-markdown))/
) && // AND file isn't too big
file.size < props.maxFileSizeMB * 1024 * 1024
) {
// Check if the file name already exists in the files state
if (files.find((f) => f.name === file.name)) {
return null; // Skip this file
}
const formData = new FormData();
formData.append("file", file);
formData.append("filename", file.name);
try {
const processFileResponse = await axios.post(
"/api/process-file",
formData,
{
headers: {
"Content-Type": "multipart/form-data",
},
}
);
if (processFileResponse.status === 200) {
const text = processFileResponse.data.text;
const meanEmbedding = processFileResponse.data.meanEmbedding;
const chunks = processFileResponse.data.chunks;
const fileObject: FileLite = {
name: file.name,
url: URL.createObjectURL(file),
type: file.type,
size: file.size,
expanded: false,
embedding: meanEmbedding,
chunks,
extractedText: text,
};
console.log(fileObject);
return fileObject;
} else {
console.log("Error creating file embedding");
return null;
}
} catch (err: any) {
console.log(`Error creating file embedding: ${err}`);
return null;
}
} else {
alert(
`Invalid file type or size. Only TXT, PDF, DOCX or MD are allowed, up to ${props.maxFileSizeMB}MB.`
);
return null; // Skip this file
}
})
);
// Filter out any null values from the uploadedFiles array
const validFiles = compact(uploadedFiles);
// Set the files state with the valid files and the existing files
setFiles((prevFiles) => [...prevFiles, ...validFiles]);
handleSetFiles((prevFiles) => [...prevFiles, ...validFiles]);
setLoading(false);
}
},
[files, handleSetFiles, props.maxFileSizeMB, props.maxNumFiles]
);
  // Drag-and-drop handlers for the dropzone. Each one calls preventDefault()
  // so the browser does not navigate to / open the dropped file itself.
  const handleDragEnter = useCallback((event: React.DragEvent) => {
    event.preventDefault();
    // Highlight the dropzone while a drag is hovering over it.
    setDragOver(true);
  }, []);
  const handleDragOver = useCallback((event: React.DragEvent) => {
    event.preventDefault();
  }, []);
  const handleDragLeave = useCallback((event: React.DragEvent) => {
    event.preventDefault();
    // Remove the highlight when the drag leaves the dropzone.
    setDragOver(false);
  }, []);
  const handleDrop = useCallback(
    (event: React.DragEvent) => {
      event.preventDefault();
      setDragOver(false);
      // Route dropped files through the same path as the file-picker input.
      const droppedFiles = event.dataTransfer.files;
      handleFileChange(droppedFiles);
    },
    [handleFileChange]
  );
return (
<div className="flex items-center justify-center w-full flex-col">
<label
htmlFor="dropzone-file"
className={`flex flex-col shadow items-center justify-center w-full h-36 border-2 border-gray-300 border-dashed rounded-lg cursor-pointer bg-gray-50 hover:bg-gray-100 relative ${
dragOver ? "border-blue-500 bg-blue-50" : ""
}`}
ref={dropzoneRef}
onDragEnter={handleDragEnter}
onDragOver={handleDragOver}
onDragLeave={handleDragLeave}
onDrop={handleDrop}
>
<div className="flex flex-col items-center justify-center pt-5 pb-6">
{loading ? (
<LoadingText text="Uploading..." />
) : (
<div className="text-gray-500 flex flex-col items-center text-center">
<ArrowUpTrayIcon className="w-7 h-7 mb-4" />
<p className="mb-2 text-sm">
<span className="font-semibold">Click to upload</span> or drag
and drop
</p>
<p className="text-xs">
TXT, PDF, DOCX or MD (max {props.maxFileSizeMB}MB per file)
</p>
<p className="text-xs mt-1">
You can upload up to {props.maxNumFiles - files.length} more{" "}
{props.maxNumFiles - files.length === 1 ? "file" : "files"}
</p>
<input
id="dropzone-file"
type="file"
className="hidden"
multiple
onChange={(event) => handleFileChange(event.target.files)}
/>
</div>
)}
</div>
</label>
{error && (
<div className="flex items-center justify-center w-full mt-4">
<p className="text-sm text-red-500">{error}</p>
</div>
)}
<FileViewerList files={files} title="Uploaded Files" />
</div>
);
}
export default memo(FileUploadArea);

@ -0,0 +1,73 @@
import React, { memo, useCallback, useState } from "react";
import { ChevronUpIcon } from "@heroicons/react/24/outline";
import clsx from "clsx";
import { Transition } from "@headlessui/react";
import File from "./File";
import { FileLite } from "../types/file";
type FileViewerListProps = {
files: FileLite[];
title: string;
listExpanded?: boolean;
showScores?: boolean;
};
// Collapsible list of files with a count badge and an expand/collapse chevron.
// Renders nothing when there are no files.
function FileViewerList(props: FileViewerListProps) {
  // Whether the list body is expanded; seeded from the optional prop.
  const [listExpanded, setListExpanded] = useState(props.listExpanded ?? false);

  // Toggle the expanded/collapsed state (shared by the header and the chevron).
  const handleListExpand = useCallback(() => {
    setListExpanded((prev) => !prev);
  }, []);

  return (
    <div className="flex items-left justify-center w-full">
      {props.files.length > 0 && (
        <div className="flex flex-col items-left justify-center w-full mt-4">
          <div className="flex flex-row">
            <div
              className="rounded-md flex shadow p-2 mb-2 w-full bg-gray-50 items-center cursor-pointer "
              onClick={handleListExpand}
            >
              {props.title}
              <div className="bg-gray-300 ml-2 px-2 rounded-full w-max text-center text-sm ">
                {props.files.length}
              </div>
            </div>
            <div className="ml-auto w-max flex items-center justify-center">
              <ChevronUpIcon
                className={clsx(
                  "w-6 h-6 ml-2 stroke-slate-400 transition-transform cursor-pointer",
                  !listExpanded && "-rotate-180"
                )}
                onClick={handleListExpand}
              />
            </div>
          </div>
          <Transition
            show={listExpanded}
            enter="transition duration-125 ease-out"
            enterFrom="transform translate-y-4 opacity-0"
            enterTo="transform translate-y-0 opacity-100"
            leave="transition duration-125 ease-out"
            leaveFrom="transform translate-y-0 opacity-100"
            leaveTo="transform translate-y-4 opacity-0"
          >
            <div className="text-sm text-gray-500 space-y-2">
              {props.files.map((file) => (
                <File
                  key={file.name}
                  file={file}
                  showScore={props.showScores}
                />
              ))}
            </div>
          </Transition>
        </div>
      )}
    </div>
  );
}
export default memo(FileViewerList);

@ -0,0 +1,33 @@
import clsx from "clsx";
type Props = {
className?: string;
size?: number;
};
// Spinning SVG loading indicator (two arcs, rotated via Tailwind animate-spin).
export default function LoadingSpinner(props: Props) {
  // Tailwind size unit used for both width and height; defaults to 5 (1.25rem).
  const size = props.size || 5;
  // NOTE(review): `w-${size} h-${size}` builds Tailwind class names at runtime;
  // Tailwind's static scanner cannot see these, so sizes other than those used
  // elsewhere in the codebase may not be generated — confirm or safelist them.
  return (
    <div className={clsx("flex flex-row", props.className)}>
      <svg
        aria-hidden="true"
        className={clsx(
          "mr-2 text-gray-200 animate-spin dark:text-gray-600 fill-black stroke-1",
          `w-${size} h-${size}`
        )}
        viewBox="0 0 100 101"
        fill="none"
        xmlns="http://www.w3.org/2000/svg"
      >
        <path
          d="M100 50.5908C100 78.2051 77.6142 100.591 50 100.591C22.3858 100.591 0 78.2051 0 50.5908C0 22.9766 22.3858 0.59082 50 0.59082C77.6142 0.59082 100 22.9766 100 50.5908ZM9.08144 50.5908C9.08144 73.1895 27.4013 91.5094 50 91.5094C72.5987 91.5094 90.9186 73.1895 90.9186 50.5908C90.9186 27.9921 72.5987 9.67226 50 9.67226C27.4013 9.67226 9.08144 27.9921 9.08144 50.5908Z"
          fill="currentColor"
        />
        <path
          d="M93.9676 39.0409C96.393 38.4038 97.8624 35.9116 97.0079 33.5539C95.2932 28.8227 92.871 24.3692 89.8167 20.348C85.8452 15.1192 80.8826 10.7238 75.2124 7.41289C69.5422 4.10194 63.2754 1.94025 56.7698 1.05124C51.7666 0.367541 46.6976 0.446843 41.7345 1.27873C39.2613 1.69328 37.813 4.19778 38.4501 6.62326C39.0873 9.04874 41.5694 10.4717 44.0505 10.1071C47.8511 9.54855 51.7191 9.52689 55.5402 10.0491C60.8642 10.7766 65.9928 12.5457 70.6331 15.2552C75.2735 17.9648 79.3347 21.5619 82.5849 25.841C84.9175 28.9121 86.7997 32.2913 88.1811 35.8758C89.083 38.2158 91.5421 39.6781 93.9676 39.0409Z"
          fill="currentFill"
        />
      </svg>
    </div>
  );
}

@ -0,0 +1,18 @@
import React, { memo } from "react";
import LoadingSpinner from "./LoadingSpinner";
type LoadingTextProps = {
text: string;
};
// A loading spinner with an optional label rendered next to it.
function LoadingText(props: LoadingTextProps) {
  return (
    <div className="text-gray-500 text-md flex flex-row justify-center items-center">
      <LoadingSpinner />
      {props.text && <div className="flex">{props.text}</div>}
    </div>
  );
}
export default memo(LoadingText);

@ -0,0 +1,6 @@
import "@/styles/globals.css";
import type { AppProps } from "next/app";
// Custom Next.js App component: renders the active page with its props.
export default function App({ Component, pageProps }: AppProps) {
  return <Component {...pageProps} />;
}

@ -0,0 +1,13 @@
import { Html, Head, Main, NextScript } from "next/document";
// Custom Next.js Document: sets the document language and standard structure.
export default function Document() {
  return (
    <Html lang="en">
      <Head />
      <body>
        <Main />
        <NextScript />
      </body>
    </Html>
  );
}

@ -0,0 +1,77 @@
import type { NextApiRequest, NextApiResponse } from "next";
import { completionStream } from "../../services/openai";
import { FileChunk } from "../../types/file";
// JSON response shape (used for error responses; successful responses are
// streamed as plain text instead).
type Data = {
  answer?: string;
  error?: string;
};
// Maximum number of characters of file content included in the prompt —
// presumably ~2000 tokens at ~3 characters per token; TODO confirm sizing.
const MAX_FILES_LENGTH = 2000 * 3;
// Streams an answer to `question` generated from the supplied file chunks.
// Responds 405 for non-POST, 400 for invalid input, 500 for pre-stream errors;
// on success it streams the completion text as a server-sent-event response.
export default async function handler(
  req: NextApiRequest,
  res: NextApiResponse<Data>
) {
  // Only accept POST requests
  if (req.method !== "POST") {
    res.status(405).json({ error: "Method not allowed" });
    return;
  }

  const fileChunks = req.body.fileChunks as FileChunk[];
  const question = req.body.question as string;

  if (!Array.isArray(fileChunks)) {
    res.status(400).json({ error: "fileChunks must be an array" });
    return;
  }

  if (!question) {
    res.status(400).json({ error: "question must be a string" });
    return;
  }

  // Once the streaming headers have been written we can no longer send a
  // JSON error response, so track that state for the catch block below.
  let headersSent = false;

  try {
    // Concatenate the chunk texts (prefixed with their filenames) into one
    // context string, truncated to keep the prompt within the model limit.
    const filesString = fileChunks
      .map((fileChunk) => `###\n\"${fileChunk.filename}\"\n${fileChunk.text}`)
      .join("\n")
      .slice(0, MAX_FILES_LENGTH);

    const prompt =
      `Given a question, try to answer it using the content of the file extracts below, and if you cannot answer, or find a relevant file, just output \"I couldn't find the answer to that question in your files.\".\n\n` +
      `If the answer is not contained in the files or if there are no file extracts, respond with \"I couldn't find the answer to that question in your files.\" If the question is not actually a question, respond with \"That's not a valid question.\"\n\n` +
      `In the cases where you can find the answer, first give the answer. Then explain how you found the answer from the source or sources, and use the exact filenames of the source files you mention. Do not make up the names of any other files other than those mentioned in the files context. Give the answer in markdown format.` +
      `Use the following format:\n\nQuestion: <question>\n\nFiles:\n<###\n\"filename 1\"\nfile text>\n<###\n\"filename 2\"\nfile text>...\n\nAnswer: <answer or "I couldn't find the answer to that question in your files" or "That's not a valid question.">\n\n` +
      `Question: ${question}\n\n` +
      `Files:\n${filesString}\n\n` +
      `Answer:`;

    const stream = completionStream({
      prompt,
      model: "text-davinci-003",
    });

    // Set the response headers for streaming
    res.writeHead(200, {
      "Content-Type": "text/event-stream",
      "Cache-Control": "no-cache, no-transform",
      Connection: "keep-alive",
    });
    headersSent = true;

    // Write the data from the stream to the response
    for await (const data of stream) {
      res.write(data);
    }

    // End the response when the stream is done
    res.end();
  } catch (error) {
    console.error(error);
    if (headersSent) {
      // Headers are already on the wire; sending a JSON 500 now would throw
      // "headers already sent" — just terminate the stream instead.
      res.end();
    } else {
      res.status(500).json({ error: "Something went wrong" });
    }
  }
}

@ -0,0 +1,65 @@
import type { NextApiRequest, NextApiResponse } from "next";
import formidable, { Fields, Files } from "formidable"; // to handle file uploads
import { TextEmbedding } from "../../types/file";
import extractTextFromFile from "../../services/extractTextFromFile";
import { createEmbeddings } from "../../services/createEmbeddings";
// Disable the default body parser to handle file uploads
export const config = { api: { bodyParser: false } };
type Data = {
text?: string;
meanEmbedding?: number[];
chunks?: TextEmbedding[];
error?: string;
};
// This function receives a file as a multipart form and returns the text extracted fom the file and the OpenAI embedding for that text
// Parses a multipart POST containing a single file, extracts its text, and
// returns the text plus per-chunk embeddings and their mean embedding.
export default async function handler(
  req: NextApiRequest,
  res: NextApiResponse<Data>
) {
  // Only POST carries a multipart body; reject everything else.
  if (req.method !== "POST") {
    res.status(405).json({ error: "Method not allowed" });
    return;
  }
  // Create a formidable instance to parse the request as a multipart form
  const form = new formidable.IncomingForm();
  try {
    // Promisify form.parse so the rest of the handler can use async/await.
    const { fields, files } = await new Promise<{
      fields: Fields;
      files: Files;
    }>((resolve, reject) => {
      form.parse(req, (err, fields, files) => {
        if (err) {
          reject(err);
        } else {
          resolve({ fields, files } as { fields: Fields; files: Files });
        }
      });
    });
    // Expect exactly one non-empty upload under the "file" field.
    const file = files.file;
    if (!file || Array.isArray(file) || file.size === 0) {
      res.status(400).json({ error: "Invalid or missing file" });
      return;
    }
    // Extract raw text from the uploaded file based on its reported MIME type.
    const text = await extractTextFromFile({
      filepath: file.filepath,
      filetype: file.mimetype ?? "",
    });
    // Compute per-chunk embeddings and their mean for the extracted text.
    const { meanEmbedding, chunks } = await createEmbeddings({
      text,
    });
    res.status(200).json({ text, meanEmbedding, chunks });
  } catch (error: any) {
    res.status(500).json({ error: error.message });
  } finally {
    // Always send a response, even if it is empty
    res.end();
  }
}

@ -0,0 +1,59 @@
import type { NextApiRequest, NextApiResponse } from "next";
import { searchFileChunks } from "../../services/searchFileChunks";
import { FileChunk, FileLite } from "../../types/file";
type Data = {
searchResults?: FileChunk[];
error?: string;
};
export const config = {
api: {
bodyParser: {
sizeLimit: "30mb",
},
},
};
// Searches the chunks of the supplied files for the ones most semantically
// similar to `searchQuery`, returning at most `maxResults` chunks.
export default async function handler(
  req: NextApiRequest,
  res: NextApiResponse<Data>
) {
  // Only accept POST requests, consistent with the other API routes (this
  // guard was missing here, so any HTTP method reached the body parsing).
  if (req.method !== "POST") {
    res.status(405).json({ error: "Method not allowed" });
    return;
  }

  try {
    const searchQuery = req.body.searchQuery as string;
    const files = req.body.files as FileLite[];
    const maxResults = req.body.maxResults as number;

    if (!searchQuery) {
      res.status(400).json({ error: "searchQuery must be a string" });
      return;
    }

    if (!Array.isArray(files) || files.length === 0) {
      res.status(400).json({ error: "files must be a non-empty array" });
      return;
    }

    if (!maxResults || maxResults < 1) {
      res
        .status(400)
        .json({ error: "maxResults must be a number greater than 0" });
      return;
    }

    const searchResults = await searchFileChunks({
      searchQuery,
      files,
      maxResults,
    });

    res.status(200).json({ searchResults });
  } catch (error) {
    console.error(error);
    res.status(500).json({ error: "Something went wrong" });
  }
}

@ -0,0 +1,35 @@
import Head from "next/head";
import { useState } from "react";
import FileQandAArea from "../components/FileQandAArea";
import { FileLite } from "../types/file";
import FileUploadArea from "../components/FileUploadArea";
// Main page: upload files, then ask questions answered from their content.
export default function FileQandA() {
  // All files the user has uploaded (with embeddings), shared by both areas.
  const [files, setFiles] = useState<FileLite[]>([]);
  return (
    <div className="flex items-left text-left h-screen flex-col">
      <Head>
        <title>File Q&A</title>
      </Head>
      <div className="max-w-3xl mx-auto m-8 space-y-8 text-gray-800">
        <h1 className="text-4xl">File Q&A</h1>
        <div className="">
          To search for answers from the content in your files, upload them here
          and we will use OpenAI embeddings and GPT to find answers from the
          relevant documents.
        </div>
        <FileUploadArea
          handleSetFiles={setFiles}
          maxNumFiles={75}
          maxFileSizeMB={30}
        />
        <FileQandAArea files={files} />
      </div>
    </div>
  );
}

@ -0,0 +1,74 @@
// A function that splits a text into smaller pieces of roughly equal length.
// The pieces are delimited by sentences and try to avoid breaking words or punctuation.
// This can be useful for processing long texts with natural language models that have a limited input size.
export function chunkText({
  text, // The input text to be split
  // The desired maximum length of each piece in characters
  // This uses 4 characters as an approximation of the average token length
  // since there isn't a good JS tokenizer at the moment
  maxCharLength = 250 * 4,
}: {
  text: string;
  maxCharLength?: number;
}): string[] {
  const chunks: string[] = [];
  let currentChunk = "";

  // Strip leading separator periods/spaces and surrounding whitespace so every
  // saved chunk (including the final one) is cleaned consistently.
  const clean = (chunk: string): string => chunk.replace(/^[. ]+/, "").trim();

  // Tolerance range of 50% around the target length.
  const lowerBound = maxCharLength - maxCharLength * 0.5;
  const upperBound = maxCharLength + maxCharLength * 0.5;

  // Remove any newline characters from the text and split it by periods.
  // This assumes that periods mark the end of sentences, which may not be true for some languages.
  const sentences = text.replace(/\n/g, " ").split(/([.])/);

  for (const sentence of sentences) {
    const trimmedSentence = sentence.trim();
    if (!trimmedSentence) continue;

    // A single sentence longer than the upper bound can never fit within the
    // tolerance range: flush the current chunk, then hard-split the sentence
    // into maxCharLength-sized slices so no chunk exceeds the limit.
    if (trimmedSentence.length > upperBound) {
      const flushed = clean(currentChunk);
      if (flushed) chunks.push(flushed);
      currentChunk = "";
      for (let i = 0; i < trimmedSentence.length; i += maxCharLength) {
        chunks.push(trimmedSentence.slice(i, i + maxCharLength));
      }
      continue;
    }

    // Length the chunk would have if this sentence were appended
    // (the +1 accounts for the joining space).
    const chunkLength = currentChunk.length + trimmedSentence.length + 1;

    if (chunkLength > upperBound) {
      // Appending would make the chunk too long: save the current chunk and
      // start a new one with this sentence.
      const flushed = clean(currentChunk);
      if (flushed) chunks.push(flushed);
      currentChunk = trimmedSentence;
    } else if (chunkLength >= lowerBound && currentChunk) {
      // Appending lands the chunk in the tolerance range: append the sentence
      // first (the previous implementation pushed the chunk and silently
      // discarded the sentence, losing text), then save and reset.
      // Add a space before the sentence unless it is a period.
      currentChunk += `${trimmedSentence === "." ? "" : " "}${trimmedSentence}`;
      const flushed = clean(currentChunk);
      if (flushed) chunks.push(flushed);
      currentChunk = "";
    } else {
      // The chunk is still too short: append the sentence and continue.
      // Add a space before the sentence unless it is a period.
      currentChunk += `${trimmedSentence === "." ? "" : " "}${trimmedSentence}`;
    }
  }

  // Save any remaining text, cleaned like the other chunks (the previous
  // implementation pushed the final chunk with its leading separator space).
  const finalChunk = clean(currentChunk);
  if (finalChunk) chunks.push(finalChunk);

  // Return the array of pieces
  return chunks;
}

@ -0,0 +1,54 @@
import { TextEmbedding } from "../types/file";
import { getEmbeddingsForText } from "./getEmbeddingsForText";
export type Embeddings = {
meanEmbedding: number[];
chunks: TextEmbedding[];
};
// Embeds a text chunk-by-chunk and also returns the mean of those embeddings.
// On any failure this is best-effort: it logs and returns empty results.
export async function createEmbeddings({
  text,
  maxCharLength,
}: {
  text: string;
  maxCharLength?: number;
}): Promise<Embeddings> {
  try {
    const chunkEmbeddings = await getEmbeddingsForText({
      text,
      maxCharLength,
    });

    // With zero or one chunk, the mean is simply that chunk's embedding
    // (or an empty vector when there are no chunks at all).
    if (chunkEmbeddings.length <= 1) {
      return {
        meanEmbedding: chunkEmbeddings[0]?.embedding ?? [],
        chunks: chunkEmbeddings,
      };
    }

    // Otherwise average the chunk embeddings component-wise.
    const dimensions = chunkEmbeddings[0].embedding.length;
    const meanEmbedding = Array.from({ length: dimensions }, (_, i) => {
      let total = 0;
      for (const { embedding } of chunkEmbeddings) {
        total += embedding[i];
      }
      return total / chunkEmbeddings.length;
    });

    return {
      meanEmbedding,
      chunks: chunkEmbeddings,
    };
  } catch (error: any) {
    console.log("Error: ", error);
    return {
      meanEmbedding: [],
      chunks: [],
    };
  }
}

@ -0,0 +1,45 @@
import fs from "fs";
import mammoth from "mammoth";
import pdfParse from "pdf-parse";
import { NodeHtmlMarkdown } from "node-html-markdown";
// Extracts plain text from an uploaded file on disk, dispatching on MIME type.
// Supports PDF, DOCX, markdown/CSV/HTML (converted via NodeHtmlMarkdown), and
// plain text; throws for anything else.
export default async function extractTextFromFile({
  filepath,
  filetype,
}: {
  filepath: string;
  filetype: string;
}): Promise<string> {
  // Read the whole file into a single Buffer via a stream.
  const buffer: Buffer = await new Promise((resolve, reject) => {
    const fileStream = fs.createReadStream(filepath);
    const chunks: any[] = [];
    fileStream.on("data", (chunk) => {
      chunks.push(chunk);
    });
    fileStream.on("error", (error) => {
      reject(error);
    });
    fileStream.on("end", () => {
      resolve(Buffer.concat(chunks));
    });
  });
  // Handle different file types using different modules
  switch (filetype) {
    case "application/pdf":
      const pdfData = await pdfParse(buffer);
      return pdfData.text;
    case "application/vnd.openxmlformats-officedocument.wordprocessingml.document": // i.e. docx file
      // mammoth reads from disk itself, so it gets the path, not the buffer.
      const docxResult = await mammoth.extractRawText({ path: filepath });
      return docxResult.value;
    case "text/markdown":
    case "text/csv":
    case "text/html":
      // NOTE(review): markdown and CSV are also run through the HTML-to-
      // Markdown translator here — confirm that pass-through is intended.
      const html = buffer.toString();
      return NodeHtmlMarkdown.translate(html);
    case "text/plain":
      return buffer.toString();
    default:
      throw new Error("Unsupported file type");
  }
}

@ -0,0 +1,42 @@
import { TextEmbedding } from "../types/file";
import { chunkText } from "./chunkText";
import { embedding } from "./openai";
// There isn't a good JS tokenizer at the moment, so we are using this approximation of 4 characters per token instead. This might break for some languages.
const MAX_CHAR_LENGTH = 250 * 4;
// This function takes a text and returns an array of embeddings for each chunk of the text
// The text is split into chunks of a given maximum charcter length
// The embeddings are computed in batches of a given size
export async function getEmbeddingsForText({
text,
maxCharLength = MAX_CHAR_LENGTH,
batchSize = 20,
}: {
text: string;
maxCharLength?: number;
batchSize?: number;
}): Promise<TextEmbedding[]> {
const textChunks = chunkText({ text, maxCharLength });
const batches = [];
for (let i = 0; i < textChunks.length; i += batchSize) {
batches.push(textChunks.slice(i, i + batchSize));
}
try {
const batchPromises = batches.map((batch) => embedding({ input: batch }));
const embeddings = (await Promise.all(batchPromises)).flat();
const textEmbeddings = embeddings.map((embedding, index) => ({
embedding,
text: textChunks[index],
}));
return textEmbeddings;
} catch (error: any) {
console.log("Error: ", error);
return [];
}
}

@ -0,0 +1,111 @@
import { IncomingMessage } from "http";
import {
Configuration,
CreateCompletionRequest,
CreateCompletionResponse,
OpenAIApi,
} from "openai";
// This file contains utility functions for interacting with the OpenAI API

// Fail fast at module load if the API key is not configured.
if (!process.env.OPENAI_API_KEY) {
  throw new Error("Missing OPENAI_API_KEY environment variable");
}

const configuration = new Configuration({
  apiKey: process.env.OPENAI_API_KEY,
});

// Shared OpenAI client used by all helpers in this module.
export const openai = new OpenAIApi(configuration);

// Options for the completion helpers; `fallback`, when set, is returned (or
// yielded) instead of rethrowing if the API call fails.
type CompletionOptions = Partial<CreateCompletionRequest> & {
  prompt: string;
  fallback?: string;
};

type EmbeddingOptions = {
  input: string | string[];
  model?: string;
};
// Requests a single (non-streaming) completion for `prompt` and returns its
// text. If the call fails and `fallback` is provided, returns the fallback
// instead of rethrowing.
export async function completion({
  prompt,
  fallback,
  max_tokens = 800,
  temperature = 0,
  model = "text-davinci-003",
  ...otherOptions
}: CompletionOptions) {
  try {
    const response = await openai.createCompletion({
      prompt,
      max_tokens,
      temperature,
      model,
      ...otherOptions,
    });

    const text = response.data.choices[0].text;
    if (!text) {
      throw new Error("No text returned from the completions endpoint.");
    }
    return text;
  } catch (error) {
    if (fallback) return fallback;
    throw error;
  }
}
// Streams a completion for `prompt`, yielding text deltas as they arrive.
// If the call fails and `fallback` is provided, yields the fallback once
// instead of rethrowing.
export async function* completionStream({
  prompt,
  fallback,
  max_tokens = 800,
  temperature = 0,
  model = "text-davinci-003",
}: CompletionOptions) {
  try {
    const result = await openai.createCompletion(
      {
        prompt,
        max_tokens,
        temperature,
        model,
        stream: true,
      },
      { responseType: "stream" }
    );
    const stream = result.data as any as IncomingMessage;

    // A network chunk may contain zero, one, or several SSE events, and an
    // event can be split across chunks. The previous code assumed exactly one
    // "data: " line per chunk, which drops events and can JSON.parse partial
    // payloads. Buffer the raw bytes and process only complete lines.
    let buffer = "";
    for await (const chunk of stream) {
      buffer += chunk.toString();
      const lines = buffer.split("\n");
      buffer = lines.pop() ?? ""; // keep any trailing partial line buffered

      for (const line of lines) {
        const message = line.trim().split("data: ")[1];
        if (!message) continue; // skip blank/non-data lines
        if (message === "[DONE]") {
          return;
        }
        const data = JSON.parse(message) as CreateCompletionResponse;
        yield data.choices[0].text;
      }
    }
  } catch (error) {
    if (fallback) yield fallback;
    else throw error;
  }
}
// Embeds one string or a batch of strings, returning one vector per input.
export async function embedding({
  input,
  model = "text-embedding-ada-002",
}: EmbeddingOptions): Promise<number[][]> {
  const response = await openai.createEmbedding({ model, input });
  const results = response.data.data;

  // Guard against an empty or malformed response before mapping.
  if (!results[0].embedding) {
    throw new Error("No embedding returned from the completions endpoint");
  }

  return results.map((result) => result.embedding);
}

@ -0,0 +1,53 @@
import { FileLite, FileChunk } from "../types/file";
import { embedding } from "./openai";
// This is the minimum cosine similarity score that a file must have with the search query to be considered relevant
// This is an arbitrary value, and you should vary/ remove this depending on the diversity of your dataset
const COSINE_SIM_THRESHOLD = 0.72;
// Takes a search query and a list of files, and returns the chunks of text
// that are most semantically similar to the query (highest dot product with
// the query embedding, which equals cosine similarity since the embeddings
// are normalized), thresholded and capped at maxResults.
export async function searchFileChunks({
  searchQuery,
  files,
  maxResults,
}: {
  searchQuery: string;
  files: FileLite[];
  maxResults: number;
}): Promise<FileChunk[]> {
  // Embed the search query; fall back to an empty vector if nothing came back.
  const queryEmbeddings = await embedding({ input: searchQuery });
  const queryVector = queryEmbeddings[0] ?? [];

  // Score every chunk of every file against the query vector.
  const scoredChunks: (FileChunk & { score: number })[] = [];
  for (const file of files) {
    for (const chunk of file.chunks ?? []) {
      let score = 0;
      chunk.embedding.forEach((value, i) => {
        score += value * queryVector[i];
      });
      scoredChunks.push({ ...chunk, filename: file.name, score });
    }
  }

  // Highest-scoring first, keep only sufficiently similar chunks, cap count.
  return scoredChunks
    .sort((a, b) => b.score - a.score)
    .filter((chunk) => chunk.score > COSINE_SIM_THRESHOLD)
    .slice(0, maxResults);
}

@ -0,0 +1,14 @@
// A function that takes a file name and a string and returns true if the file name is contained in the string
// after removing punctuation and whitespace from both
export const isFileNameInString = (fileName: string, str: string) => {
  // Characters to strip before comparing. The hyphen sits at the END of the
  // class so it is a literal: the previous `=-_` form was an unintended
  // character range (0x3D-0x5F), which left real hyphens un-stripped and so
  // broke matching for hyphenated filenames like "my-file.txt".
  const punctuationAndWhitespace = /[.,/#!$%^&*;:{}=_~()\s-]/g;

  // Convert both to lowercase and remove punctuation and whitespace
  const normalizedFileName = fileName
    .toLowerCase()
    .replace(punctuationAndWhitespace, "");
  const normalizedStr = str
    .toLowerCase()
    .replace(punctuationAndWhitespace, "");

  // Return true if the normalized file name is included in the normalized string
  return normalizedStr.includes(normalizedFileName);
};

@ -0,0 +1,5 @@
@import "./preflight.css";
@tailwind base;
@tailwind components;
@tailwind utilities;

@ -0,0 +1,368 @@
/* Using a custom preflight to fix conflicts with Ant Design */
/* Original: https://unpkg.com/tailwindcss@3.2.4/src/css/preflight.css */
/*
1. Prevent padding and border from affecting element width. (https://github.com/mozdevs/cssremedy/issues/4)
2. Allow adding a border to an element by just adding a border-width. (https://github.com/tailwindcss/tailwindcss/pull/116)
*/
*,
::before,
::after {
box-sizing: border-box; /* 1 */
border-width: 0; /* 2 */
border-style: solid; /* 2 */
border-color: theme("borderColor.DEFAULT"); /* 2 */
}
::before,
::after {
--tw-content: "";
}
/*
1. Use a consistent sensible line-height in all browsers.
2. Prevent adjustments of font size after orientation changes in iOS.
3. Use a more readable tab size.
4. Use the user's configured `sans` font-family by default.
5. Use the user's configured `sans` font-feature-settings by default.
*/
html {
line-height: 1.5; /* 1 */
-webkit-text-size-adjust: 100%; /* 2 */
-moz-tab-size: 4; /* 3 */
tab-size: 4; /* 3 */
font-family: theme("fontFamily.sans"); /* 4 */
}
/*
1. Remove the margin in all browsers.
2. Inherit line-height from `html` so users can set them as a class directly on the `html` element.
*/
body {
margin: 0; /* 1 */
line-height: inherit; /* 2 */
}
/*
1. Add the correct height in Firefox.
2. Correct the inheritance of border color in Firefox. (https://bugzilla.mozilla.org/show_bug.cgi?id=190655)
3. Ensure horizontal rules are visible by default.
*/
hr {
height: 0; /* 1 */
color: inherit; /* 2 */
border-top-width: 1px; /* 3 */
}
/*
Add the correct text decoration in Chrome, Edge, and Safari.
*/
abbr:where([title]) {
text-decoration: underline dotted;
}
/*
Remove the default font size and weight for headings.
*/
h1,
h2,
h3,
h4,
h5,
h6 {
font-size: inherit;
font-weight: inherit;
}
/*
Reset links to optimize for opt-in styling instead of opt-out.
*/
a {
color: inherit;
text-decoration: inherit;
}
/*
Add the correct font weight in Edge and Safari.
*/
b,
strong {
font-weight: bolder;
}
/*
1. Use the user's configured `mono` font family by default.
2. Correct the odd `em` font sizing in all browsers.
*/
code,
kbd,
samp,
pre {
font-family: theme("fontFamily.mono"); /* 1 */
font-size: 1em; /* 2 */
}
/*
Add the correct font size in all browsers.
*/
small {
font-size: 80%;
}
/*
Prevent `sub` and `sup` elements from affecting the line height in all browsers.
*/
sub,
sup {
font-size: 75%;
line-height: 0;
position: relative;
vertical-align: baseline;
}
sub {
bottom: -0.25em;
}
sup {
top: -0.5em;
}
/*
1. Remove text indentation from table contents in Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=999088, https://bugs.webkit.org/show_bug.cgi?id=201297)
2. Correct table border color inheritance in all Chrome and Safari. (https://bugs.chromium.org/p/chromium/issues/detail?id=935729, https://bugs.webkit.org/show_bug.cgi?id=195016)
3. Remove gaps between table borders by default.
*/
table {
text-indent: 0; /* 1 */
border-color: inherit; /* 2 */
border-collapse: collapse; /* 3 */
}
/*
1. Change the font styles in all browsers.
2. Remove the margin in Firefox and Safari.
3. Remove default padding in all browsers.
*/
button,
input,
optgroup,
select,
textarea {
font-family: inherit; /* 1 */
font-size: 100%; /* 1 */
font-weight: inherit; /* 1 */
line-height: inherit; /* 1 */
color: inherit; /* 1 */
margin: 0; /* 2 */
padding: 0; /* 3 */
}
/*
Remove the inheritance of text transform in Edge and Firefox.
*/
button,
select {
text-transform: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Remove default button styles.
*/
button,
[type="button"],
[type="reset"],
[type="submit"] {
-webkit-appearance: button; /* 1 */
background-image: none; /* 2 */
}
/*
Use the modern Firefox focus style for all focusable elements.
*/
:-moz-focusring {
outline: auto;
}
/*
Remove the additional `:invalid` styles in Firefox. (https://github.com/mozilla/gecko-dev/blob/2f9eacd9d3d995c937b4251a5557d95d494c9be1/layout/style/res/forms.css#L728-L737)
*/
:-moz-ui-invalid {
box-shadow: none;
}
/*
Add the correct vertical alignment in Chrome and Firefox.
*/
progress {
vertical-align: baseline;
}
/*
Correct the cursor style of increment and decrement buttons in Safari.
*/
::-webkit-inner-spin-button,
::-webkit-outer-spin-button {
height: auto;
}
/*
1. Correct the odd appearance in Chrome and Safari.
2. Correct the outline style in Safari.
*/
[type="search"] {
-webkit-appearance: textfield; /* 1 */
outline-offset: -2px; /* 2 */
}
/*
Remove the inner padding in Chrome and Safari on macOS.
*/
::-webkit-search-decoration {
-webkit-appearance: none;
}
/*
1. Correct the inability to style clickable types in iOS and Safari.
2. Change font properties to `inherit` in Safari.
*/
::-webkit-file-upload-button {
-webkit-appearance: button; /* 1 */
font: inherit; /* 2 */
}
/*
Add the correct display in Chrome and Safari.
*/
summary {
display: list-item;
}
/*
Removes the default spacing and border for appropriate elements.
*/
blockquote,
dl,
dd,
h1,
h2,
h3,
h4,
h5,
h6,
hr,
figure,
p,
pre {
margin: 0;
}
fieldset {
margin: 0;
padding: 0;
}
legend {
padding: 0;
}
ol,
ul,
menu {
list-style: none;
margin: 0;
padding: 0;
}
/*
Prevent resizing textareas horizontally by default.
*/
textarea {
resize: vertical;
}
/*
1. Reset the default placeholder opacity in Firefox. (https://github.com/tailwindlabs/tailwindcss/issues/3300)
2. Set the default placeholder color to the user's configured gray 400 color.
*/
input::placeholder,
textarea::placeholder {
opacity: 1; /* 1 */
color: theme("colors.gray.400"); /* 2 */
}
/*
Set the default cursor for buttons.
*/
button,
[role="button"] {
cursor: pointer;
}
/*
Make sure disabled buttons don't get the pointer cursor.
*/
:disabled {
cursor: default;
}
/*
1. Make replaced elements `display: block` by default. (https://github.com/mozdevs/cssremedy/issues/14)
2. Add `vertical-align: middle` to align replaced elements more sensibly by default. (https://github.com/jensimmons/cssremedy/issues/14#issuecomment-634934210)
This can trigger a poorly considered lint error in some tools but is included by design.
*/
img,
svg,
video,
canvas,
audio,
iframe,
embed,
object {
display: block; /* 1 */
vertical-align: middle; /* 2 */
}
/*
Constrain images and videos to the parent width and preserve their intrinsic aspect ratio. (https://github.com/mozdevs/cssremedy/issues/14)
*/
img,
video {
max-width: 100%;
height: auto;
}
/* Make elements with the HTML hidden attribute stay hidden by default */
[hidden] {
display: none;
}

@ -0,0 +1,21 @@
// A lightweight client-side representation of an uploaded file.
export interface FileLite {
  expanded?: boolean; // Whether the file row is expanded in the viewer UI
  name: string; // The file name
  url?: string; // Object URL (from URL.createObjectURL) for viewing the file
  type?: string; // MIME type of the file
  score?: number; // Similarity score of the file for a search query
  size?: number; // File size in bytes
  embedding?: number[]; // The file embedding -- or mean embedding if there are multiple embeddings for the file
  chunks?: TextEmbedding[]; // The chunks of text and their embeddings
  extractedText?: string; // The extracted text from the file
}

// A chunk of text from a file: its embedding plus the source filename.
export interface FileChunk extends TextEmbedding {
  filename: string;
  score?: number; // Dot-product similarity with a search query embedding
}

// A piece of text together with its embedding vector.
export interface TextEmbedding {
  text: string;
  embedding: number[];
}

@ -0,0 +1,28 @@
const { fontFamily } = require("tailwindcss/defaultTheme");
/** @type {import('tailwindcss').Config} */
module.exports = {
content: [
"./app/**/*.{js,ts,jsx,tsx}",
"./src/**/*.{js,ts,jsx,tsx}",
"./pages/**/*.{js,ts,jsx,tsx}",
"./components/**/*.{js,ts,jsx,tsx}",
],
corePlugins: {
preflight: false,
},
theme: {
extend: {
},
},
keyframes: {
blink: {
"0%, 100%": { opacity: 1 },
"50%": { opacity: 0 },
},
},
plugins: [
require("@tailwindcss/line-clamp"),
require("@tailwindcss/typography"),
],
};

@ -0,0 +1,24 @@
{
"compilerOptions": {
"target": "es5",
"lib": ["dom", "dom.iterable", "esnext"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
"forceConsistentCasingInFileNames": true,
"noEmit": true,
"esModuleInterop": true,
"module": "esnext",
"moduleResolution": "node",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"incremental": true,
"baseUrl": ".",
"paths": {
"@/*": ["./src/*"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx"],
"exclude": ["node_modules"]
}
Loading…
Cancel
Save