From 7a02df558849c43f9e257a7d5bdca5c74ed5b0fc Mon Sep 17 00:00:00 2001
From: Pavel
Date: Tue, 9 Apr 2024 19:56:07 +0400
Subject: [PATCH] Multiple uploads

---
 application/api/user/routes.py | 56 +++++++++++++++++++++-------------
 application/worker.py          | 36 +++++++++++++++++++---
 frontend/src/upload/Upload.tsx |  2 +-
 3 files changed, 67 insertions(+), 27 deletions(-)

diff --git a/application/api/user/routes.py b/application/api/user/routes.py
index e80ec52..7e5462b 100644
--- a/application/api/user/routes.py
+++ b/application/api/user/routes.py
@@ -1,5 +1,6 @@
 import os
 import uuid
+import shutil
 from flask import Blueprint, request, jsonify
 from urllib.parse import urlparse
 import requests
@@ -136,30 +137,43 @@ def upload_file():
         return {"status": "no name"}
     job_name = secure_filename(request.form["name"])
     # check if the post request has the file part
-    if "file" not in request.files:
-        print("No file part")
-        return {"status": "no file"}
-    file = request.files["file"]
-    if file.filename == "":
+    files = request.files.getlist("file")
+
+    if not files or all(file.filename == '' for file in files):
         return {"status": "no file name"}
-    if file:
-        filename = secure_filename(file.filename)
-        # save dir
-        save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name)
-        # create dir if not exists
-        if not os.path.exists(save_dir):
-            os.makedirs(save_dir)
-
-        file.save(os.path.join(save_dir, filename))
-        task = ingest.delay(settings.UPLOAD_FOLDER, [".rst", ".md", ".pdf", ".txt", ".docx",
-                                                     ".csv", ".epub", ".html", ".mdx"],
-                            job_name, filename, user)
-        # task id
-        task_id = task.id
-        return {"status": "ok", "task_id": task_id}
+    # Directory where files will be saved
+    save_dir = os.path.join(current_dir, settings.UPLOAD_FOLDER, user, job_name)
+    os.makedirs(save_dir, exist_ok=True)
+
+    if len(files) > 1:
+        # Multiple files; prepare them for zip
+        temp_dir = os.path.join(save_dir, "temp")
+        os.makedirs(temp_dir, exist_ok=True)
+
+        for file in files:
+            filename = secure_filename(file.filename)
+            file.save(os.path.join(temp_dir, filename))
+
+        # Use shutil.make_archive to zip the temp directory
+        zip_path = shutil.make_archive(base_name=os.path.join(save_dir, job_name), format='zip', root_dir=temp_dir)
+        final_filename = os.path.basename(zip_path)
+
+        # Clean up the temporary directory after zipping
+        shutil.rmtree(temp_dir)
     else:
-        return {"status": "error"}
+        # Single file
+        file = files[0]
+        final_filename = secure_filename(file.filename)
+        file_path = os.path.join(save_dir, final_filename)
+        file.save(file_path)
+
+    # Call ingest with the single file or zipped file
+    task = ingest.delay(settings.UPLOAD_FOLDER, [".rst", ".md", ".pdf", ".txt", ".docx",
+                                                 ".csv", ".epub", ".html", ".mdx"],
+                        job_name, final_filename, user)
+
+    return {"status": "ok", "task_id": task.id}
 
 
 @user.route("/api/remote", methods=["POST"])
 def upload_remote():
diff --git a/application/worker.py b/application/worker.py
index 3891fde..eb28242 100644
--- a/application/worker.py
+++ b/application/worker.py
@@ -36,6 +36,32 @@
 current_dir = os.path.dirname(
     os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 )
 
+def extract_zip_recursive(zip_path, extract_to, current_depth=0, max_depth=5):
+    """
+    Recursively extract zip files with a limit on recursion depth.
+
+    Args:
+        zip_path (str): Path to the zip file to be extracted.
+        extract_to (str): Destination path for extracted files.
+        current_depth (int): Current depth of recursion.
+        max_depth (int): Maximum allowed depth of recursion to prevent infinite loops.
+ """ + if current_depth > max_depth: + print(f"Reached maximum recursion depth of {max_depth}") + return + + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(extract_to) + os.remove(zip_path) # Remove the zip file after extracting + + # Check for nested zip files and extract them + for root, dirs, files in os.walk(extract_to): + for file in files: + if file.endswith(".zip"): + # If a nested zip file is found, extract it recursively + file_path = os.path.join(root, file) + extract_zip_recursive(file_path, root, current_depth + 1, max_depth) + # Define the main function for ingesting and processing documents. def ingest_worker(self, directory, formats, name_job, filename, user): @@ -66,9 +92,11 @@ def ingest_worker(self, directory, formats, name_job, filename, user): token_check = True min_tokens = 150 max_tokens = 1250 - full_path = directory + "/" + user + "/" + name_job + recursion_depth = 2 + full_path = os.path.join(directory, user, name_job) import sys + print(full_path, file=sys.stderr) # check if API_URL env variable is set file_data = {"name": name_job, "file": filename, "user": user} @@ -81,14 +109,12 @@ def ingest_worker(self, directory, formats, name_job, filename, user): if not os.path.exists(full_path): os.makedirs(full_path) - with open(full_path + "/" + filename, "wb") as f: + with open(os.path.join(full_path, filename), "wb") as f: f.write(file) # check if file is .zip and extract it if filename.endswith(".zip"): - with zipfile.ZipFile(full_path + "/" + filename, "r") as zip_ref: - zip_ref.extractall(full_path) - os.remove(full_path + "/" + filename) + extract_zip_recursive(os.path.join(full_path, filename), full_path, 0, recursion_depth) self.update_state(state="PROGRESS", meta={"current": 1}) diff --git a/frontend/src/upload/Upload.tsx b/frontend/src/upload/Upload.tsx index 39c2a09..3ae2178 100644 --- a/frontend/src/upload/Upload.tsx +++ b/frontend/src/upload/Upload.tsx @@ -201,7 +201,7 @@ export default function Upload({ const 
{ getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop, - multiple: false, + multiple: true, onDragEnter: doNothing, onDragOver: doNothing, onDragLeave: doNothing,