Mirror of https://github.com/arc53/DocsGPT, synced 2024-11-17 21:26:26 +00:00
This commit is contained in: parent bbf55ca46e, commit b4bd34fb96
@@ -1,2 +1,8 @@
 OPENAI_API_KEY=<LLM api key (for example, open ai key)>
 EMBEDDINGS_KEY=<LLM embeddings api key (for example, open ai key)>
+
+# Azure
+OPENAI_API_BASE=
+OPENAI_API_VERSION=
+AZURE_DEPLOYMENT_NAME=
+AZURE_EMBEDDINGS_DEPLOYMENT_NAME=
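Note: these Azure values are consumed together. The Flask application changes later in this commit only route traffic to Azure when OPENAI_API_BASE, OPENAI_API_VERSION, and AZURE_DEPLOYMENT_NAME are all non-empty. A condensed sketch of that selection (make_llm is a hypothetical helper, not part of the commit):

    from langchain.chat_models import AzureChatOpenAI, ChatOpenAI

    def make_llm(settings, api_key):
        # all three Azure settings must be set, otherwise fall back to plain OpenAI
        if settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME:
            return AzureChatOpenAI(
                openai_api_key=api_key,
                openai_api_base=settings.OPENAI_API_BASE,
                openai_api_version=settings.OPENAI_API_VERSION,
                deployment_name=settings.AZURE_DEPLOYMENT_NAME,
            )
        return ChatOpenAI(openai_api_key=api_key)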
@@ -3,4 +3,8 @@ EMBEDDINGS_KEY=your_api_key
 CELERY_BROKER_URL=redis://localhost:6379/0
 CELERY_RESULT_BACKEND=redis://localhost:6379/1
 MONGO_URI=mongodb://localhost:27017/docsgpt
 API_URL=http://localhost:5001
+
+OPENAI_API_BASE=
+OPENAI_API_VERSION=
+AZURE_DEPLOYMENT_NAME=
@@ -5,8 +5,8 @@ import json
 import os
 import traceback
 
-import openai
 import dotenv
+import openai
 import requests
 from celery import Celery
 from celery.result import AsyncResult
@@ -16,9 +16,14 @@ from langchain import VectorDBQA, HuggingFaceHub, Cohere, OpenAI
 from langchain.chains import LLMChain, ConversationalRetrievalChain
 from langchain.chains.conversational_retrieval.prompts import CONDENSE_QUESTION_PROMPT
 from langchain.chains.question_answering import load_qa_chain
-from langchain.chat_models import ChatOpenAI
-from langchain.embeddings import OpenAIEmbeddings, HuggingFaceHubEmbeddings, CohereEmbeddings, \
-    HuggingFaceInstructEmbeddings
+from langchain.chat_models import ChatOpenAI, AzureChatOpenAI
+from langchain.embeddings import (
+    OpenAIEmbeddings,
+    HuggingFaceHubEmbeddings,
+    CohereEmbeddings,
+    HuggingFaceInstructEmbeddings,
+)
+from langchain.llms import GPT4All
 from langchain.prompts import PromptTemplate
 from langchain.prompts.chat import (
     ChatPromptTemplate,
@@ -28,7 +33,6 @@ from langchain.prompts.chat import (
 )
 from pymongo import MongoClient
 from werkzeug.utils import secure_filename
-from langchain.llms import GPT4All
 
 from core.settings import settings
 from error import bad_request
@@ -40,10 +44,7 @@ if settings.LLM_NAME == "manifest":
     from manifest import Manifest
     from langchain.llms.manifest import ManifestWrapper
 
-    manifest = Manifest(
-        client_name="huggingface",
-        client_connection="http://127.0.0.1:5000"
-    )
+    manifest = Manifest(client_name="huggingface", client_connection="http://127.0.0.1:5000")
 
 # Redirect PosixPath to WindowsPath on Windows
 import platform
@@ -73,23 +74,17 @@ with open("prompts/chat_combine_prompt.txt", "r") as f:
 with open("prompts/chat_reduce_prompt.txt", "r") as f:
     chat_reduce_template = f.read()
 
-if settings.API_KEY is not None:
-    api_key_set = True
-else:
-    api_key_set = False
-if settings.EMBEDDINGS_KEY is not None:
-    embeddings_key_set = True
-else:
-    embeddings_key_set = False
+api_key_set = settings.API_KEY is not None
+embeddings_key_set = settings.EMBEDDINGS_KEY is not None
 
 app = Flask(__name__)
-app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER = "inputs"
-app.config['CELERY_BROKER_URL'] = settings.CELERY_BROKER_URL
-app.config['CELERY_RESULT_BACKEND'] = settings.CELERY_RESULT_BACKEND
-app.config['MONGO_URI'] = settings.MONGO_URI
+app.config["UPLOAD_FOLDER"] = UPLOAD_FOLDER = "inputs"
+app.config["CELERY_BROKER_URL"] = settings.CELERY_BROKER_URL
+app.config["CELERY_RESULT_BACKEND"] = settings.CELERY_RESULT_BACKEND
+app.config["MONGO_URI"] = settings.MONGO_URI
 celery = Celery()
-celery.config_from_object('celeryconfig')
-mongo = MongoClient(app.config['MONGO_URI'])
+celery.config_from_object("celeryconfig")
+mongo = MongoClient(app.config["MONGO_URI"])
 db = mongo["docsgpt"]
 vectors_collection = db["vectors"]
 
@@ -120,12 +115,13 @@ def get_vectorstore(data):
             vectorstore = "indexes/" + data["active_docs"]
         else:
             vectorstore = "vectors/" + data["active_docs"]
-        if data['active_docs'] == "default":
+        if data["active_docs"] == "default":
             vectorstore = ""
     else:
         vectorstore = ""
     return vectorstore
 
 
 def get_docsearch(vectorstore, embeddings_key):
     if settings.EMBEDDINGS_NAME == "openai_text-embedding-ada-002":
         docsearch = FAISS.load_local(vectorstore, OpenAIEmbeddings(openai_api_key=embeddings_key))
@@ -146,8 +142,10 @@ def ingest(self, directory, formats, name_job, filename, user):
 
 @app.route("/")
 def home():
-    return render_template("index.html", api_key_set=api_key_set, llm_choice=settings.LLM_NAME,
-                           embeddings_choice=settings.EMBEDDINGS_NAME)
+    return render_template(
+        "index.html", api_key_set=api_key_set, llm_choice=settings.LLM_NAME, embeddings_choice=settings.EMBEDDINGS_NAME
+    )
+
 
 def complete_stream(question, docsearch, chat_history, api_key):
     openai.api_key = api_key
@@ -169,22 +167,25 @@ def complete_stream(question, docsearch, chat_history, api_key):
         messages_combine.append({"role": "user", "content": i["prompt"]})
         messages_combine.append({"role": "system", "content": i["response"]})
     messages_combine.append({"role": "user", "content": question})
-    completion = openai.ChatCompletion.create(model="gpt-3.5-turbo",
-                                              messages=messages_combine, stream=True, max_tokens=1000, temperature=0)
+    completion = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo", messages=messages_combine, stream=True, max_tokens=1000, temperature=0
+    )
 
     for line in completion:
-        if 'content' in line['choices'][0]['delta']:
+        if "content" in line["choices"][0]["delta"]:
             # check if the delta contains content
-            data = json.dumps({"answer": str(line['choices'][0]['delta']['content'])})
+            data = json.dumps({"answer": str(line["choices"][0]["delta"]["content"])})
             yield f"data: {data}\n\n"
     # send data.type = "end" to indicate that the stream has ended as json
     data = json.dumps({"type": "end"})
     yield f"data: {data}\n\n"
-@app.route("/stream", methods=['POST', 'GET'])
+
+
+@app.route("/stream", methods=["POST", "GET"])
 def stream():
     # get parameter from url question
-    question = request.args.get('question')
-    history = request.args.get('history')
+    question = request.args.get("question")
+    history = request.args.get("history")
     # history to json object from string
     history = json.loads(history)
 
@@ -204,10 +205,10 @@ def stream():
         vectorstore = ""
     docsearch = get_docsearch(vectorstore, embeddings_key)
 
-    #question = "Hi"
-    return Response(complete_stream(question, docsearch,
-                                    chat_history= history, api_key=api_key), mimetype='text/event-stream')
+    # question = "Hi"
+    return Response(
+        complete_stream(question, docsearch, chat_history=history, api_key=api_key), mimetype="text/event-stream"
+    )
 
 
 @app.route("/api/answer", methods=["POST"])
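For reference, the reshaped /stream endpoint still speaks plain server-sent events: each chunk arrives as a `data: {...}` line and the server closes with `{"type": "end"}`. A minimal client sketch (not part of the commit; assumes the API_URL from .env, http://localhost:5001):

    import json
    import requests

    resp = requests.get(
        "http://localhost:5001/stream",
        params={"question": "What is DocsGPT?", "history": "[]"},
        stream=True,
    )
    for raw in resp.iter_lines(decode_unicode=True):
        if not raw or not raw.startswith("data: "):
            continue  # skip blank SSE separators
        event = json.loads(raw[len("data: "):])
        if event.get("type") == "end":
            break  # server signals completion
        print(event["answer"], end="", flush=True)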
@@ -215,7 +216,7 @@ def api_answer():
     data = request.get_json()
     question = data["question"]
     history = data["history"]
-    print('-' * 5)
+    print("-" * 5)
     if not api_key_set:
         api_key = data["api_key"]
     else:
@@ -233,14 +234,23 @@ def api_answer():
     # Note if you have used other embeddings than OpenAI, you need to change the embeddings
     docsearch = get_docsearch(vectorstore, embeddings_key)
 
-    q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
-                              template_format="jinja2")
+    q_prompt = PromptTemplate(
+        input_variables=["context", "question"], template=template_quest, template_format="jinja2"
+    )
     if settings.LLM_NAME == "openai_chat":
-        llm = ChatOpenAI(openai_api_key=api_key)  # optional parameter: model_name="gpt-4"
+        if settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME:  # azure
+            llm = AzureChatOpenAI(
+                openai_api_key=api_key,
+                openai_api_base=settings.OPENAI_API_BASE,
+                openai_api_version=settings.OPENAI_API_VERSION,
+                deployment_name=settings.AZURE_DEPLOYMENT_NAME,
+            )
+        else:
+            llm = ChatOpenAI(openai_api_key=api_key)  # optional parameter: model_name="gpt-4"
         messages_combine = [SystemMessagePromptTemplate.from_template(chat_combine_template)]
         if history:
             tokens_current_history = 0
-            #count tokens in history
+            # count tokens in history
             history.reverse()
             for i in history:
                 if "prompt" in i and "response" in i:
@@ -251,6 +261,7 @@ def api_answer():
                     messages_combine.append(AIMessagePromptTemplate.from_template(i["response"]))
         messages_combine.append(HumanMessagePromptTemplate.from_template("{question}"))
         import sys
+
         print(messages_combine, file=sys.stderr)
         p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
     elif settings.LLM_NAME == "openai":
@@ -292,8 +303,9 @@ def api_answer():
         result = run_async_chain(chain, question, chat_history)
 
     else:
-        qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce",
-                                 combine_prompt=chat_combine_template, question_prompt=q_prompt)
+        qa_chain = load_qa_chain(
+            llm=llm, chain_type="map_reduce", combine_prompt=chat_combine_template, question_prompt=q_prompt
+        )
         chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=docsearch, k=3)
         result = chain({"query": question})
 
@@ -301,10 +313,10 @@ def api_answer():
 
     # some formatting for the frontend
     if "result" in result:
-        result['answer'] = result['result']
-    result['answer'] = result['answer'].replace("\\n", "\n")
+        result["answer"] = result["result"]
+    result["answer"] = result["answer"].replace("\\n", "\n")
     try:
-        result['answer'] = result['answer'].split("SOURCES:")[0]
+        result["answer"] = result["answer"].split("SOURCES:")[0]
     except Exception:
         pass
 
@@ -327,16 +339,16 @@ def check_docs():
     data = request.get_json()
     # split docs on / and take first part
     if data["docs"].split("/")[0] == "local":
-        return {"status": 'exists'}
+        return {"status": "exists"}
     vectorstore = "vectors/" + data["docs"]
-    base_path = 'https://raw.githubusercontent.com/arc53/DocsHUB/main/'
+    base_path = "https://raw.githubusercontent.com/arc53/DocsHUB/main/"
     if os.path.exists(vectorstore) or data["docs"] == "default":
-        return {"status": 'exists'}
+        return {"status": "exists"}
     else:
         r = requests.get(base_path + vectorstore + "index.faiss")
 
         if r.status_code != 200:
-            return {"status": 'null'}
+            return {"status": "null"}
         else:
             if not os.path.exists(vectorstore):
                 os.makedirs(vectorstore)
@@ -348,7 +360,7 @@ def check_docs():
             with open(vectorstore + "index.pkl", "wb") as f:
                 f.write(r.content)
 
-    return {"status": 'loaded'}
+    return {"status": "loaded"}
 
 
 @app.route("/api/feedback", methods=["POST"])
@@ -358,187 +370,190 @@ def api_feedback():
     answer = data["answer"]
     feedback = data["feedback"]
 
-    print('-' * 5)
+    print("-" * 5)
     print("Question: " + question)
     print("Answer: " + answer)
     print("Feedback: " + feedback)
-    print('-' * 5)
+    print("-" * 5)
     response = requests.post(
         url="https://86x89umx77.execute-api.eu-west-2.amazonaws.com/docsgpt-feedback",
         headers={
             "Content-Type": "application/json; charset=utf-8",
         },
-        data=json.dumps({
-            "answer": answer,
-            "question": question,
-            "feedback": feedback
-        })
+        data=json.dumps({"answer": answer, "question": question, "feedback": feedback}),
     )
-    return {"status": http.client.responses.get(response.status_code, 'ok')}
+    return {"status": http.client.responses.get(response.status_code, "ok")}
 
 
-@app.route('/api/combine', methods=['GET'])
+@app.route("/api/combine", methods=["GET"])
 def combined_json():
-    user = 'local'
+    user = "local"
     """Provide json file with combined available indexes."""
     # get json from https://d3dg1063dc54p9.cloudfront.net/combined.json
 
-    data = [{
-        "name": 'default',
-        "language": 'default',
-        "version": '',
-        "description": 'default',
-        "fullName": 'default',
-        "date": 'default',
-        "docLink": 'default',
-        "model": settings.EMBEDDINGS_NAME,
-        "location": "local"
-    }]
+    data = [
+        {
+            "name": "default",
+            "language": "default",
+            "version": "",
+            "description": "default",
+            "fullName": "default",
+            "date": "default",
+            "docLink": "default",
+            "model": settings.EMBEDDINGS_NAME,
+            "location": "local",
+        }
+    ]
     # structure: name, language, version, description, fullName, date, docLink
     # append data from vectors_collection
-    for index in vectors_collection.find({'user': user}):
-        data.append({
-            "name": index['name'],
-            "language": index['language'],
-            "version": '',
-            "description": index['name'],
-            "fullName": index['name'],
-            "date": index['date'],
-            "docLink": index['location'],
-            "model": settings.EMBEDDINGS_NAME,
-            "location": "local"
-        })
+    for index in vectors_collection.find({"user": user}):
+        data.append(
+            {
+                "name": index["name"],
+                "language": index["language"],
+                "version": "",
+                "description": index["name"],
+                "fullName": index["name"],
+                "date": index["date"],
+                "docLink": index["location"],
+                "model": settings.EMBEDDINGS_NAME,
+                "location": "local",
+            }
+        )
 
     data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json()
     for index in data_remote:
-        index['location'] = "remote"
+        index["location"] = "remote"
         data.append(index)
 
     return jsonify(data)
 
 
-@app.route('/api/upload', methods=['POST'])
+@app.route("/api/upload", methods=["POST"])
 def upload_file():
     """Upload a file to get vectorized and indexed."""
-    if 'user' not in request.form:
-        return {"status": 'no user'}
-    user = secure_filename(request.form['user'])
-    if 'name' not in request.form:
-        return {"status": 'no name'}
-    job_name = secure_filename(request.form['name'])
+    if "user" not in request.form:
+        return {"status": "no user"}
+    user = secure_filename(request.form["user"])
+    if "name" not in request.form:
+        return {"status": "no name"}
+    job_name = secure_filename(request.form["name"])
     # check if the post request has the file part
-    if 'file' not in request.files:
-        print('No file part')
-        return {"status": 'no file'}
-    file = request.files['file']
-    if file.filename == '':
-        return {"status": 'no file name'}
+    if "file" not in request.files:
+        print("No file part")
+        return {"status": "no file"}
+    file = request.files["file"]
+    if file.filename == "":
+        return {"status": "no file name"}
 
     if file:
         filename = secure_filename(file.filename)
         # save dir
-        save_dir = os.path.join(app.config['UPLOAD_FOLDER'], user, job_name)
+        save_dir = os.path.join(app.config["UPLOAD_FOLDER"], user, job_name)
         # create dir if not exists
         if not os.path.exists(save_dir):
            os.makedirs(save_dir)
 
         file.save(os.path.join(save_dir, filename))
-        task = ingest.delay('temp', [".rst", ".md", ".pdf", ".txt"], job_name, filename, user)
+        task = ingest.delay("temp", [".rst", ".md", ".pdf", ".txt"], job_name, filename, user)
         # task id
         task_id = task.id
-        return {"status": 'ok', "task_id": task_id}
+        return {"status": "ok", "task_id": task_id}
     else:
-        return {"status": 'error'}
+        return {"status": "error"}
 
 
-@app.route('/api/task_status', methods=['GET'])
+@app.route("/api/task_status", methods=["GET"])
 def task_status():
     """Get celery job status."""
-    task_id = request.args.get('task_id')
+    task_id = request.args.get("task_id")
     task = AsyncResult(task_id)
     task_meta = task.info
     return {"status": task.status, "result": task_meta}
 
 
 ### Backgound task api
-@app.route('/api/upload_index', methods=['POST'])
+@app.route("/api/upload_index", methods=["POST"])
 def upload_index_files():
     """Upload two files(index.faiss, index.pkl) to the user's folder."""
-    if 'user' not in request.form:
-        return {"status": 'no user'}
-    user = secure_filename(request.form['user'])
-    if 'name' not in request.form:
-        return {"status": 'no name'}
-    job_name = secure_filename(request.form['name'])
-    if 'file_faiss' not in request.files:
-        print('No file part')
-        return {"status": 'no file'}
-    file_faiss = request.files['file_faiss']
-    if file_faiss.filename == '':
-        return {"status": 'no file name'}
-    if 'file_pkl' not in request.files:
-        print('No file part')
-        return {"status": 'no file'}
-    file_pkl = request.files['file_pkl']
-    if file_pkl.filename == '':
-        return {"status": 'no file name'}
+    if "user" not in request.form:
+        return {"status": "no user"}
+    user = secure_filename(request.form["user"])
+    if "name" not in request.form:
+        return {"status": "no name"}
+    job_name = secure_filename(request.form["name"])
+    if "file_faiss" not in request.files:
+        print("No file part")
+        return {"status": "no file"}
+    file_faiss = request.files["file_faiss"]
+    if file_faiss.filename == "":
+        return {"status": "no file name"}
+    if "file_pkl" not in request.files:
+        print("No file part")
+        return {"status": "no file"}
+    file_pkl = request.files["file_pkl"]
+    if file_pkl.filename == "":
+        return {"status": "no file name"}
 
     # saves index files
-    save_dir = os.path.join('indexes', user, job_name)
+    save_dir = os.path.join("indexes", user, job_name)
     if not os.path.exists(save_dir):
         os.makedirs(save_dir)
-    file_faiss.save(os.path.join(save_dir, 'index.faiss'))
-    file_pkl.save(os.path.join(save_dir, 'index.pkl'))
+    file_faiss.save(os.path.join(save_dir, "index.faiss"))
+    file_pkl.save(os.path.join(save_dir, "index.pkl"))
     # create entry in vectors_collection
-    vectors_collection.insert_one({
-        "user": user,
-        "name": job_name,
-        "language": job_name,
-        "location": save_dir,
-        "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
-        "model": settings.EMBEDDINGS_NAME,
-        "type": "local"
-    })
-    return {"status": 'ok'}
+    vectors_collection.insert_one(
+        {
+            "user": user,
+            "name": job_name,
+            "language": job_name,
+            "location": save_dir,
+            "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
+            "model": settings.EMBEDDINGS_NAME,
+            "type": "local",
+        }
+    )
+    return {"status": "ok"}
 
 
-@app.route('/api/download', methods=['get'])
+@app.route("/api/download", methods=["get"])
 def download_file():
-    user = secure_filename(request.args.get('user'))
-    job_name = secure_filename(request.args.get('name'))
-    filename = secure_filename(request.args.get('file'))
-    save_dir = os.path.join(app.config['UPLOAD_FOLDER'], user, job_name)
+    user = secure_filename(request.args.get("user"))
+    job_name = secure_filename(request.args.get("name"))
+    filename = secure_filename(request.args.get("file"))
+    save_dir = os.path.join(app.config["UPLOAD_FOLDER"], user, job_name)
     return send_from_directory(save_dir, filename, as_attachment=True)
 
 
-@app.route('/api/delete_old', methods=['get'])
+@app.route("/api/delete_old", methods=["get"])
 def delete_old():
     """Delete old indexes."""
     import shutil
-    path = request.args.get('path')
-    dirs = path.split('/')
+
+    path = request.args.get("path")
+    dirs = path.split("/")
     dirs_clean = []
     for i in range(1, len(dirs)):
         dirs_clean.append(secure_filename(dirs[i]))
     # check that path strats with indexes or vectors
-    if dirs[0] not in ['indexes', 'vectors']:
-        return {"status": 'error'}
-    path_clean = '/'.join(dirs)
-    vectors_collection.delete_one({'location': path})
+    if dirs[0] not in ["indexes", "vectors"]:
+        return {"status": "error"}
+    path_clean = "/".join(dirs)
+    vectors_collection.delete_one({"location": path})
     try:
         shutil.rmtree(path_clean)
     except FileNotFoundError:
         pass
-    return {"status": 'ok'}
+    return {"status": "ok"}
 
 
 # handling CORS
 @app.after_request
 def after_request(response):
-    response.headers.add('Access-Control-Allow-Origin', '*')
-    response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization')
-    response.headers.add('Access-Control-Allow-Methods', 'GET,PUT,POST,DELETE,OPTIONS')
-    response.headers.add('Access-Control-Allow-Credentials', 'true')
+    response.headers.add("Access-Control-Allow-Origin", "*")
+    response.headers.add("Access-Control-Allow-Headers", "Content-Type,Authorization")
+    response.headers.add("Access-Control-Allow-Methods", "GET,PUT,POST,DELETE,OPTIONS")
+    response.headers.add("Access-Control-Allow-Credentials", "true")
     return response
 
@@ -16,6 +16,9 @@ class Settings(BaseSettings):
 
     API_KEY: str = None  # LLM api key
     EMBEDDINGS_KEY: str = None  # api key for embeddings (if using openai, just copy API_KEY
+    OPENAI_API_BASE: str = ""  # azure openai api base url
+    OPENAI_API_VERSION: str = ""  # azure openai api version
+    AZURE_DEPLOYMENT_NAME: str = ""  # azure deployment name
 
 
 path = Path(__file__).parent.parent.absolute()
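Because the new fields default to empty strings, the Azure branch in the Flask application stays inactive on an unconfigured install: empty strings are falsy, so the combined check short-circuits. A quick illustration (not part of the commit):

    use_azure = bool(settings.OPENAI_API_BASE and settings.OPENAI_API_VERSION and settings.AZURE_DEPLOYMENT_NAME)
    # "" and "" and "" evaluates to "" (falsy), so ChatOpenAI remains the default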
@@ -19,6 +19,9 @@ services:
       - CELERY_BROKER_URL=redis://redis:6379/0
       - CELERY_RESULT_BACKEND=redis://redis:6379/1
       - MONGO_URI=mongodb://mongo:27017/docsgpt
+      - OPENAI_API_BASE=$OPENAI_API_BASE
+      - OPENAI_API_VERSION=$OPENAI_API_VERSION
+      - AZURE_DEPLOYMENT_NAME=$AZURE_DEPLOYMENT_NAME
     ports:
       - "5001:5001"
     volumes:
@@ -39,6 +42,9 @@ services:
       - CELERY_RESULT_BACKEND=redis://redis:6379/1
       - MONGO_URI=mongodb://mongo:27017/docsgpt
       - API_URL=http://backend:5001
+      - OPENAI_API_BASE=$OPENAI_API_BASE
+      - OPENAI_API_VERSION=$OPENAI_API_VERSION
+      - AZURE_DEPLOYMENT_NAME=$AZURE_DEPLOYMENT_NAME
     depends_on:
       - redis
       - mongo
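Compose substitutes $OPENAI_API_BASE, $OPENAI_API_VERSION, and $AZURE_DEPLOYMENT_NAME from the host shell or from an .env file next to docker-compose.yml; a hypothetical example (all values are placeholders):

    OPENAI_API_BASE=https://your-resource.openai.azure.com/
    OPENAI_API_VERSION=2023-03-15-preview
    AZURE_DEPLOYMENT_NAME=gpt-35-turbo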
@@ -11,11 +11,11 @@ from retry import retry
 # from langchain.embeddings import CohereEmbeddings
 
 
-def num_tokens_from_string(string: str, encoding_name: str) -> int:
+def num_tokens_from_string(string: str, encoding_name: str) -> tuple[int, float]:
     # Function to convert string to tokens and estimate user cost.
     encoding = tiktoken.get_encoding(encoding_name)
     num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens / 1000) * 0.0004)
+    total_price = (num_tokens / 1000) * 0.0004
     return num_tokens, total_price
 
 
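The annotation fix matches what the function already returned: a (token count, estimated cost) pair priced at $0.0004 per 1,000 tokens. A usage sketch (encoding name assumed; not part of the commit):

    tokens, price = num_tokens_from_string("How do I ingest my docs?", "cl100k_base")
    print(tokens, f"${price:.6f}")  # cost uses the 0.0004-per-1k rate hard-coded above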
@@ -33,6 +33,7 @@ def call_openai_api(docs, folder_name):
         os.makedirs(f"outputs/{folder_name}")
 
     from tqdm import tqdm
+
     docs_test = [docs[0]]
     # remove the first element from docs
     docs.pop(0)
@@ -44,15 +45,25 @@ def call_openai_api(docs, folder_name):
     # environment="us-east1-gcp"  # next to api key in console
     # )
     # index_name = "pandas"
-    store = FAISS.from_documents(docs_test, OpenAIEmbeddings())
+    if (  # azure
+        os.environ.get("OPENAI_API_BASE")
+        and os.environ.get("OPENAI_API_VERSION")
+        and os.environ.get("AZURE_DEPLOYMENT_NAME")
+    ):
+        os.environ["OPENAI_API_TYPE"] = "azure"
+        openai_embeddings = OpenAIEmbeddings(model=os.environ.get("AZURE_EMBEDDINGS_DEPLOYMENT_NAME"))
+    else:
+        openai_embeddings = OpenAIEmbeddings()
+    store = FAISS.from_documents(docs_test, openai_embeddings)
     # store_pine = Pinecone.from_documents(docs_test, OpenAIEmbeddings(), index_name=index_name)
 
     # Uncomment for MPNet embeddings
     # model_name = "sentence-transformers/all-mpnet-base-v2"
     # hf = HuggingFaceEmbeddings(model_name=model_name)
     # store = FAISS.from_documents(docs_test, hf)
-    for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs),
-                  bar_format='{l_bar}{bar}| Time Left: {remaining}'):
+    for i in tqdm(
+        docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format="{l_bar}{bar}| Time Left: {remaining}"
+    ):
         try:
             store_add_texts_with_retry(store, i)
         except Exception as e:
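To exercise the new Azure embeddings branch locally, all four environment variables have to be present before call_openai_api builds the FAISS store; a sketch (values are placeholders, not part of the commit):

    import os

    os.environ["OPENAI_API_BASE"] = "https://your-resource.openai.azure.com/"
    os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
    os.environ["AZURE_DEPLOYMENT_NAME"] = "gpt-35-turbo"
    os.environ["AZURE_EMBEDDINGS_DEPLOYMENT_NAME"] = "text-embedding-ada-002"
    # call_openai_api(docs, folder_name) then sets OPENAI_API_TYPE=azure itself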