From 43622e7ab145c826b65786217195458c77cc35ee Mon Sep 17 00:00:00 2001 From: Serj Date: Sat, 29 Apr 2023 15:40:55 +0100 Subject: [PATCH 1/8] Added settings file --- application/core/__init__.py | 0 application/core/settings.py | 10 ++++++++++ 2 files changed, 10 insertions(+) create mode 100644 application/core/__init__.py create mode 100644 application/core/settings.py diff --git a/application/core/__init__.py b/application/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/application/core/settings.py b/application/core/settings.py new file mode 100644 index 0000000..416b903 --- /dev/null +++ b/application/core/settings.py @@ -0,0 +1,10 @@ +from pydantic import BaseSettings +from pathlib import Path + + +class Settings(BaseSettings): + openai_token: str + + +path = Path(__file__).parent.parent.absolute() +settings = Settings(_env_file=path.joinpath(".env"), _env_file_encoding="utf-8") From c9d24b8f42053ceebf9d42a9da7d8233a60a53cf Mon Sep 17 00:00:00 2001 From: Serj Date: Sat, 29 Apr 2023 15:44:47 +0100 Subject: [PATCH 2/8] Added llm model variable --- application/app.py | 22 +++++++++------------- application/core/settings.py | 1 + 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/application/app.py b/application/app.py index 5defcc7..8419cf5 100644 --- a/application/app.py +++ b/application/app.py @@ -28,21 +28,17 @@ from werkzeug.utils import secure_filename from error import bad_request from worker import ingest_worker +from core.settings import settings import celeryconfig # os.environ["LANGCHAIN_HANDLER"] = "langchain" -if os.getenv("LLM_NAME") is not None: - llm_choice = os.getenv("LLM_NAME") -else: - llm_choice = "openai_chat" - if os.getenv("EMBEDDINGS_NAME") is not None: embeddings_choice = os.getenv("EMBEDDINGS_NAME") else: embeddings_choice = "openai_text-embedding-ada-002" -if llm_choice == "manifest": +if settings.LLM_NAME == "manifest": from manifest import Manifest from langchain.llms.manifest import 
ManifestWrapper @@ -122,7 +118,7 @@ def ingest(self, directory, formats, name_job, filename, user): @app.route("/") def home(): - return render_template("index.html", api_key_set=api_key_set, llm_choice=llm_choice, + return render_template("index.html", api_key_set=api_key_set, llm_choice=settings.LLM_NAME, embeddings_choice=embeddings_choice) @@ -182,7 +178,7 @@ def api_answer(): q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest, template_format="jinja2") - if llm_choice == "openai_chat": + if settings.LLM_NAME == "openai_chat": # llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-4") llm = ChatOpenAI(openai_api_key=api_key) messages_combine = [ @@ -195,16 +191,16 @@ def api_answer(): HumanMessagePromptTemplate.from_template("{question}") ] p_chat_reduce = ChatPromptTemplate.from_messages(messages_reduce) - elif llm_choice == "openai": + elif settings.LLM_NAME == "openai": llm = OpenAI(openai_api_key=api_key, temperature=0) - elif llm_choice == "manifest": + elif settings.LLM_NAME == "manifest": llm = ManifestWrapper(client=manifest, llm_kwargs={"temperature": 0.001, "max_tokens": 2048}) - elif llm_choice == "huggingface": + elif settings.LLM_NAME == "huggingface": llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key) - elif llm_choice == "cohere": + elif settings.LLM_NAME == "cohere": llm = Cohere(model="command-xlarge-nightly", cohere_api_key=api_key) - if llm_choice == "openai_chat": + if settings.LLM_NAME == "openai_chat": question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT) doc_chain = load_qa_chain(llm, chain_type="map_reduce", combine_prompt=p_chat_combine) chain = ConversationalRetrievalChain( diff --git a/application/core/settings.py b/application/core/settings.py index 416b903..bb5063f 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -3,6 +3,7 @@ from pathlib import Path class Settings(BaseSettings): + LLM_NAME: str = 
"openai_chat" openai_token: str From b723e14d98008084612a12d25522b477037f891a Mon Sep 17 00:00:00 2001 From: Serj Date: Sat, 29 Apr 2023 15:46:09 +0100 Subject: [PATCH 3/8] Added embeddings name variable --- application/app.py | 21 ++++++++------------- application/core/settings.py | 1 + 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/application/app.py b/application/app.py index 8419cf5..1185b1a 100644 --- a/application/app.py +++ b/application/app.py @@ -33,11 +33,6 @@ import celeryconfig # os.environ["LANGCHAIN_HANDLER"] = "langchain" -if os.getenv("EMBEDDINGS_NAME") is not None: - embeddings_choice = os.getenv("EMBEDDINGS_NAME") -else: - embeddings_choice = "openai_text-embedding-ada-002" - if settings.LLM_NAME == "manifest": from manifest import Manifest from langchain.llms.manifest import ManifestWrapper @@ -119,7 +114,7 @@ def ingest(self, directory, formats, name_job, filename, user): @app.route("/") def home(): return render_template("index.html", api_key_set=api_key_set, llm_choice=settings.LLM_NAME, - embeddings_choice=embeddings_choice) + embeddings_choice=settings.EMBEDDINGS_NAME) @app.route("/api/answer", methods=["POST"]) @@ -156,13 +151,13 @@ def api_answer(): # vectorstore = "outputs/inputs/" # loading the index and the store and the prompt template # Note if you have used other embeddings than OpenAI, you need to change the embeddings - if embeddings_choice == "openai_text-embedding-ada-002": + if settings.EMBEDDINGS_NAME == "openai_text-embedding-ada-002": docsearch = FAISS.load_local(vectorstore, OpenAIEmbeddings(openai_api_key=embeddings_key)) - elif embeddings_choice == "huggingface_sentence-transformers/all-mpnet-base-v2": + elif settings.EMBEDDINGS_NAME == "huggingface_sentence-transformers/all-mpnet-base-v2": docsearch = FAISS.load_local(vectorstore, HuggingFaceHubEmbeddings()) - elif embeddings_choice == "huggingface_hkunlp/instructor-large": + elif settings.EMBEDDINGS_NAME == "huggingface_hkunlp/instructor-large": docsearch 
= FAISS.load_local(vectorstore, HuggingFaceInstructEmbeddings()) - elif embeddings_choice == "cohere_medium": + elif settings.EMBEDDINGS_NAME == "cohere_medium": docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key)) # create a prompt template @@ -312,7 +307,7 @@ def combined_json(): "fullName": 'default', "date": 'default', "docLink": 'default', - "model": embeddings_choice, + "model": settings.EMBEDDINGS_NAME, "location": "local" }] # structure: name, language, version, description, fullName, date, docLink @@ -326,7 +321,7 @@ def combined_json(): "fullName": index['name'], "date": index['date'], "docLink": index['location'], - "model": embeddings_choice, + "model": settings.EMBEDDINGS_NAME, "location": "local" }) @@ -417,7 +412,7 @@ def upload_index_files(): "language": job_name, "location": save_dir, "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), - "model": embeddings_choice, + "model": settings.EMBEDDINGS_NAME, "type": "local" }) return {"status": 'ok'} diff --git a/application/core/settings.py b/application/core/settings.py index bb5063f..de1cbab 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -4,6 +4,7 @@ from pathlib import Path class Settings(BaseSettings): LLM_NAME: str = "openai_chat" + EMBEDDINGS_NAME: str = "openai_text-embedding-ada-002" openai_token: str From 47e5d5684ace8edd27172b5ad7d72d2ddb2bdc53 Mon Sep 17 00:00:00 2001 From: Serj Date: Sat, 29 Apr 2023 15:50:02 +0100 Subject: [PATCH 4/8] Replace other env variables in the file --- application/app.py | 14 +++++++------- application/core/settings.py | 6 ++++++ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/application/app.py b/application/app.py index 1185b1a..e850544 100644 --- a/application/app.py +++ b/application/app.py @@ -70,20 +70,20 @@ with open("prompts/chat_combine_prompt.txt", "r") as f: with open("prompts/chat_reduce_prompt.txt", "r") as f: chat_reduce_template = f.read() -if 
os.getenv("API_KEY") is not None: +if settings.API_KEY is not None: api_key_set = True else: api_key_set = False -if os.getenv("EMBEDDINGS_KEY") is not None: +if settings.EMBEDDINGS_KEY is not None: embeddings_key_set = True else: embeddings_key_set = False app = Flask(__name__) app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER = "inputs" -app.config['CELERY_BROKER_URL'] = os.getenv("CELERY_BROKER_URL") -app.config['CELERY_RESULT_BACKEND'] = os.getenv("CELERY_RESULT_BACKEND") -app.config['MONGO_URI'] = os.getenv("MONGO_URI") +app.config['CELERY_BROKER_URL'] = settings.CELERY_BROKER_URL +app.config['CELERY_RESULT_BACKEND'] = settings.CELERY_RESULT_BACKEND +app.config['MONGO_URI'] = settings.MONGO_URI celery = Celery() celery.config_from_object('celeryconfig') mongo = MongoClient(app.config['MONGO_URI']) @@ -126,11 +126,11 @@ def api_answer(): if not api_key_set: api_key = data["api_key"] else: - api_key = os.getenv("API_KEY") + api_key = settings.API_KEY if not embeddings_key_set: embeddings_key = data["embeddings_key"] else: - embeddings_key = os.getenv("EMBEDDINGS_KEY") + embeddings_key = settings.EMBEDDINGS_KEY # use try and except to check for exception try: diff --git a/application/core/settings.py b/application/core/settings.py index de1cbab..6106aa9 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -6,6 +6,12 @@ class Settings(BaseSettings): LLM_NAME: str = "openai_chat" EMBEDDINGS_NAME: str = "openai_text-embedding-ada-002" openai_token: str + CELERY_BROKER_URL: str + CELERY_RESULT_BACKEND: str + MONGO_URI: str + + API_KEY: str = None + EMBEDDINGS_KEY: str = None path = Path(__file__).parent.parent.absolute() From 2d92e95c8a251310ed549631c99f34cc1810c44a Mon Sep 17 00:00:00 2001 From: Serj Date: Sat, 29 Apr 2023 15:56:32 +0100 Subject: [PATCH 5/8] Added settings usage to the worker --- application/core/settings.py | 3 ++- application/worker.py | 37 +++++++++++++++--------------------- 2 files changed, 17 insertions(+), 23 deletions(-) 
diff --git a/application/core/settings.py b/application/core/settings.py index 6106aa9..55c9400 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -5,11 +5,12 @@ from pathlib import Path class Settings(BaseSettings): LLM_NAME: str = "openai_chat" EMBEDDINGS_NAME: str = "openai_text-embedding-ada-002" - openai_token: str CELERY_BROKER_URL: str CELERY_RESULT_BACKEND: str MONGO_URI: str + API_URL: str = "http://localhost:5001" + API_KEY: str = None EMBEDDINGS_KEY: str = None diff --git a/application/worker.py b/application/worker.py index ce2c82d..c64c6ee 100644 --- a/application/worker.py +++ b/application/worker.py @@ -6,7 +6,7 @@ from parser.file.bulk import SimpleDirectoryReader from parser.schema.base import Document from parser.open_ai_func import call_openai_api from parser.token_func import group_split -from celery import current_task +from application.core.settings import settings import string @@ -18,11 +18,12 @@ try: nltk.download('averaged_perceptron_tagger', quiet=True) except FileExistsError: pass + + def generate_random_string(length): return ''.join([string.ascii_letters[i % 52] for i in range(length)]) - def ingest_worker(self, directory, formats, name_job, filename, user): # directory = 'inputs' or 'temp' # formats = [".rst", ".md"] @@ -39,12 +40,8 @@ def ingest_worker(self, directory, formats, name_job, filename, user): max_tokens = 1250 full_path = directory + '/' + user + '/' + name_job # check if API_URL env variable is set - if not os.environ.get('API_URL'): - url = 'http://localhost:5001/api/download' - else: - url = os.environ.get('API_URL') + '/api/download' file_data = {'name': name_job, 'file': filename, 'user': user} - response = requests.get(url, params=file_data) + response = requests.get(os.path.join(settings.API_URL, "/api/download"), params=file_data) file = response.content if not os.path.exists(full_path): @@ -58,8 +55,6 @@ def ingest_worker(self, directory, formats, name_job, filename, user): 
zip_ref.extractall(full_path) os.remove(full_path + '/' + filename) - - import time self.update_state(state='PROGRESS', meta={'current': 1}) raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive, @@ -78,22 +73,20 @@ def ingest_worker(self, directory, formats, name_job, filename, user): # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl # and send them to the server (provide user and name in form) - if not os.environ.get('API_URL'): - url = 'http://localhost:5001/api/upload_index' - else: - url = os.environ.get('API_URL') + '/api/upload_index' file_data = {'name': name_job, 'user': user} files = {'file_faiss': open(full_path + '/index.faiss', 'rb'), 'file_pkl': open(full_path + '/index.pkl', 'rb')} - response = requests.post(url, files=files, data=file_data) - - #deletes remote - if not os.environ.get('API_URL'): - url = 'http://localhost:5001/api/delete_old?path=' + 'inputs/' + user + '/' + name_job - else: - url = os.environ.get('API_URL') + '/api/delete_old?path=' + 'inputs/' + user + '/' + name_job - response = requests.get(url) + response = requests.post(os.path.join(settings.API_URL, "/api/upload_index"), files=files, data=file_data) + + response = requests.get(os.path.join(settings.API_URL, "/api/delete_old?path=")) # delete local shutil.rmtree(full_path) - return {'directory': directory, 'formats': formats, 'name_job': name_job, 'filename': filename, 'user': user, 'limited': False} + return { + 'directory': directory, + 'formats': formats, + 'name_job': name_job, + 'filename': filename, + 'user': user, + 'limited': False + } From 4efcb388ffeabda5985053160359fe73929d9dd6 Mon Sep 17 00:00:00 2001 From: Serj Date: Sat, 29 Apr 2023 15:58:02 +0100 Subject: [PATCH 6/8] Added settings usage to the worker --- application/app.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/application/app.py b/application/app.py index e850544..cd0784e 100644 --- a/application/app.py +++ b/application/app.py @@ 
-194,6 +194,8 @@ def api_answer(): llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key) elif settings.LLM_NAME == "cohere": llm = Cohere(model="command-xlarge-nightly", cohere_api_key=api_key) + else: + raise ValueError("unknown LLM model") if settings.LLM_NAME == "openai_chat": question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT) From 8742cdae0a04b67adefa0b79e14ba980c25c55cd Mon Sep 17 00:00:00 2001 From: Serj Date: Sun, 30 Apr 2023 10:46:52 +0100 Subject: [PATCH 7/8] Refactored url join --- application/core/settings.py | 6 +++--- application/worker.py | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/application/core/settings.py b/application/core/settings.py index 55c9400..87997a2 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -9,10 +9,10 @@ class Settings(BaseSettings): CELERY_RESULT_BACKEND: str MONGO_URI: str - API_URL: str = "http://localhost:5001" + API_URL: str = "http://localhost:5001" # backend url for celery worker - API_KEY: str = None - EMBEDDINGS_KEY: str = None + API_KEY: str = None # LLM api key + EMBEDDINGS_KEY: str = None # api key for embeddings (if using openai, just copy API_KEY) path = Path(__file__).parent.parent.absolute() diff --git a/application/worker.py b/application/worker.py index c64c6ee..8498bfa 100644 --- a/application/worker.py +++ b/application/worker.py @@ -6,7 +6,8 @@ from parser.file.bulk import SimpleDirectoryReader from parser.schema.base import Document from parser.open_ai_func import call_openai_api from parser.token_func import group_split -from application.core.settings import settings +from urllib.parse import urljoin +from core.settings import settings import string @@ -41,7 +42,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): full_path = directory + '/' + user + '/' + name_job # check if API_URL env variable is set file_data = {'name': name_job, 'file': filename, 'user': user} - 
response = requests.get(os.path.join(settings.API_URL, "/api/download"), params=file_data) + response = requests.get(urljoin(settings.API_URL, "/api/download"), params=file_data) file = response.content if not os.path.exists(full_path): @@ -76,9 +77,9 @@ def ingest_worker(self, directory, formats, name_job, filename, user): file_data = {'name': name_job, 'user': user} files = {'file_faiss': open(full_path + '/index.faiss', 'rb'), 'file_pkl': open(full_path + '/index.pkl', 'rb')} - response = requests.post(os.path.join(settings.API_URL, "/api/upload_index"), files=files, data=file_data) + response = requests.post(urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data) - response = requests.get(os.path.join(settings.API_URL, "/api/delete_old?path=")) + response = requests.get(urljoin(settings.API_URL, "/api/delete_old?path=")) # delete local shutil.rmtree(full_path) From 31350e63022abd4f2a506e535843bd8d532d0e6d Mon Sep 17 00:00:00 2001 From: Serj Date: Sun, 30 Apr 2023 11:03:09 +0100 Subject: [PATCH 8/8] Set celery and mongo urls as default --- .env-template | 2 ++ application/core/settings.py | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) create mode 100644 .env-template diff --git a/.env-template b/.env-template new file mode 100644 index 0000000..fe30b18 --- /dev/null +++ b/.env-template @@ -0,0 +1,2 @@ +API_KEY= +EMBEDDINGS_KEY= \ No newline at end of file diff --git a/application/core/settings.py b/application/core/settings.py index 87997a2..8c3ccb8 100644 --- a/application/core/settings.py +++ b/application/core/settings.py @@ -5,9 +5,9 @@ from pathlib import Path class Settings(BaseSettings): LLM_NAME: str = "openai_chat" EMBEDDINGS_NAME: str = "openai_text-embedding-ada-002" - CELERY_BROKER_URL: str - CELERY_RESULT_BACKEND: str - MONGO_URI: str + CELERY_BROKER_URL: str = "redis://localhost:6379/0" + CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1" + MONGO_URI: str = "mongodb://localhost:27017/docsgpt" API_URL: str = 
"http://localhost:5001" # backend url for celery worker