diff --git a/application/app.py b/application/app.py
index 406445c..a136cc7 100644
--- a/application/app.py
+++ b/application/app.py
@@ -26,7 +26,7 @@ from pymongo import MongoClient
 
 from celery import Celery, current_task
 from celery.result import AsyncResult
-from worker import my_background_task_worker, ingest_worker
+from worker import ingest_worker
 
 # os.environ["LANGCHAIN_HANDLER"] = "langchain"
 
@@ -395,6 +395,31 @@ def download_file():
     save_dir = os.path.join(app.config['UPLOAD_FOLDER'], user, job_name)
     return send_from_directory(save_dir, filename, as_attachment=True)
 
+@app.route('/api/delete_old', methods=['get'])
+def delete_old():
+    """Delete an old index directory and its record in the vectors collection.
+
+    Reads the relative directory from the ``path`` query parameter. Only
+    paths whose first component is ``indexes`` or ``vectors`` are accepted.
+    """
+    import shutil
+    path = request.args.get('path')
+    # a missing parameter would otherwise crash on .split()
+    if not path:
+        return {"status": 'error'}
+    dirs = path.split('/')
+    # check that path starts with indexes or vectors, and refuse '..' so a
+    # request cannot climb out of those directories before rmtree runs
+    if dirs[0] not in ['indexes', 'vectors'] or '..' in dirs:
+        return {"status": 'error'}
+    try:
+        shutil.rmtree(path)
+    except FileNotFoundError:
+        # directory already gone; still drop the stale database record
+        pass
+    vectors_collection.delete_one({'location': path})
+    return {"status": 'ok'}
+
 # handling CORS
 @app.after_request
 def after_request(response):
diff --git a/application/worker.py b/application/worker.py
index 2891406..5523131 100644
--- a/application/worker.py
+++ b/application/worker.py
@@ -10,9 +10,19 @@ from celery import current_task
 
 nltk.download('punkt', quiet=True)
 nltk.download('averaged_perceptron_tagger', quiet=True)
+import secrets
+import string
+import zipfile
+import shutil
 
+
+def generate_random_string(length):
+    """Return ``length`` ASCII letters, each drawn at random."""
+    return ''.join(secrets.choice(string.ascii_letters) for _ in range(length))
+
+
 def ingest_worker(self, directory, formats, name_job, filename, user):
-    # directory = 'inputs'
+    # directory = 'inputs' or 'temp'
     # formats = [".rst", ".md"]
     input_files = None
     recursive = True
@@ -21,21 +31,28 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     # name_job = 'job1'
     # filename = 'install.rst'
     # user = 'local'
+    # per-user, per-job working directory
+    full_path = directory + '/' + user + '/' + name_job
     url = 'http://localhost:5001/api/download'
     file_data = {'name': name_job, 'file': filename, 'user': user}
     response = requests.get(url, params=file_data)
     file = response.content
-    # save in folder inputs
-    # create folder if not exists
-    if not os.path.exists(directory):
-        os.makedirs(directory)
-    with open(directory + '/' + filename, 'wb') as f:
+
+    # exist_ok avoids the check-then-create race of a separate exists() test
+    os.makedirs(full_path, exist_ok=True)
+    with open(full_path + '/' + filename, 'wb') as f:
         f.write(file)
 
+    # check if file is .zip and extract it
+    if filename.endswith('.zip'):
+        with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref:
+            zip_ref.extractall(full_path)
+        os.remove(full_path + '/' + filename)
+
     import time
     self.update_state(state='PROGRESS', meta={'current': 1})
 
-    raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=input_files, recursive=recursive,
+    raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive,
                                      required_exts=formats, num_files_limit=limit, exclude_hidden=exclude).load_data()
     raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
 
@@ -43,15 +60,26 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     # We do this due to the context limits of the LLMs.
     text_splitter = RecursiveCharacterTextSplitter()
     docs = text_splitter.split_documents(raw_docs)
-    call_openai_api(docs, directory, self)
+    call_openai_api(docs, full_path, self)
     self.update_state(state='PROGRESS', meta={'current': 100})
 
     # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
     # and send them to the server (provide user and name in form)
     url = 'http://localhost:5001/api/upload_index'
     file_data = {'name': name_job, 'user': user}
-    files = {'file_faiss': open(directory + '/index.faiss', 'rb'),
-             'file_pkl': open(directory + '/index.pkl', 'rb')}
-    response = requests.post(url, files=files, data=file_data)
-    print(response.text)
+    # context managers close both index files once the upload has been sent
+    with open(full_path + '/index.faiss', 'rb') as file_faiss, \
+            open(full_path + '/index.pkl', 'rb') as file_pkl:
+        files = {'file_faiss': file_faiss, 'file_pkl': file_pkl}
+        response = requests.post(url, files=files, data=file_data)
+
+    # deletes remote
+    # NOTE(review): /api/delete_old only accepts paths under 'indexes' or
+    # 'vectors', so this 'inputs/...' request gets {"status": 'error'} back
+    # and removes nothing -- confirm which directory is meant
+    url = 'http://localhost:5001/api/delete_old'
+    response = requests.get(url, params={'path': 'inputs/' + user + '/' + name_job})
+    # delete local
+    shutil.rmtree(full_path)
+
     return {'directory': directory, 'formats': formats, 'name_job': name_job, 'filename': filename, 'user': user}