From 1d2162705d2a79fd5d1b4459a04c367789cf835e Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 13 Mar 2023 14:20:03 +0000 Subject: [PATCH 1/6] uploads backend first --- application/.env_sample | 6 +- application/app.py | 159 +++++++++++++++++- application/parser/__init__.py | 1 + application/parser/file/base.py | 20 +++ application/parser/file/base_parser.py | 38 +++++ application/parser/file/bulk.py | 163 ++++++++++++++++++ application/parser/file/docs_parser.py | 59 +++++++ application/parser/file/epub_parser.py | 43 +++++ application/parser/file/html_parser.py | 82 +++++++++ application/parser/file/markdown_parser.py | 144 ++++++++++++++++ application/parser/file/rst_parser.py | 186 +++++++++++++++++++++ application/parser/file/tabular_parser.py | 115 +++++++++++++ application/parser/java2doc.py | 61 +++++++ application/parser/js2doc.py | 67 ++++++++ application/parser/open_ai_func.py | 81 +++++++++ application/parser/py2doc.py | 113 +++++++++++++ application/parser/schema/base.py | 35 ++++ application/parser/schema/schema.py | 64 +++++++ application/static/dist/css/output.css | 50 +++++- application/static/src/chat.js | 4 +- application/templates/index.html | 42 +++-- application/worker.py | 57 +++++++ docker-compose.yaml | 21 +++ scripts/parser/file/bulk.py | 2 + scripts/parser/open_ai_func.py | 7 + 25 files changed, 1601 insertions(+), 19 deletions(-) create mode 100644 application/parser/__init__.py create mode 100644 application/parser/file/base.py create mode 100644 application/parser/file/base_parser.py create mode 100644 application/parser/file/bulk.py create mode 100644 application/parser/file/docs_parser.py create mode 100644 application/parser/file/epub_parser.py create mode 100644 application/parser/file/html_parser.py create mode 100644 application/parser/file/markdown_parser.py create mode 100644 application/parser/file/rst_parser.py create mode 100644 application/parser/file/tabular_parser.py create mode 100644 application/parser/java2doc.py create mode 100644 application/parser/js2doc.py create mode 100644 application/parser/open_ai_func.py create mode 100644 application/parser/py2doc.py create mode 100644 application/parser/schema/base.py create mode 100644 application/parser/schema/schema.py create mode 100644 application/worker.py diff --git a/application/.env_sample b/application/.env_sample index 3ad03db..38f92c0 100644 --- a/application/.env_sample +++ b/application/.env_sample @@ -1 +1,5 @@ -OPENAI_API_KEY=your_api_key \ No newline at end of file +OPENAI_API_KEY=your_api_key +EMBEDDINGS_KEY=your_api_key +CELERY_BROKER_URL=redis://localhost:6379/0 +CELERY_RESULT_BACKEND=redis://localhost:6379/1 +MONGO_URI=mongodb://localhost:27017/docsgpt \ No newline at end of file diff --git a/application/app.py b/application/app.py index 407e26b..406445c 100644 --- a/application/app.py +++ b/application/app.py @@ -1,10 +1,11 @@ import json import os import traceback +import datetime import dotenv import requests -from flask import Flask, request, render_template +from flask import Flask, request, render_template, redirect, send_from_directory, jsonify from langchain import FAISS from langchain import VectorDBQA, HuggingFaceHub, Cohere, OpenAI from langchain.chains.question_answering import load_qa_chain @@ -19,6 +20,14 @@ from langchain.prompts.chat import ( ) from error import bad_request +from werkzeug.utils import secure_filename +from pymongo import MongoClient + +from celery import Celery, current_task +from celery.result import AsyncResult + +from worker import 
my_background_task_worker, ingest_worker + # os.environ["LANGCHAIN_HANDLER"] = "langchain" @@ -53,6 +62,7 @@ if platform.system() == "Windows": # loading the .env file dotenv.load_dotenv() +# load the prompts with open("prompts/combine_prompt.txt", "r") as f: template = f.read() @@ -78,7 +88,20 @@ else: embeddings_key_set = False app = Flask(__name__) - +app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER = "inputs" +app.config['CELERY_BROKER_URL'] = os.getenv("CELERY_BROKER_URL") +app.config['CELERY_RESULT_BACKEND'] = os.getenv("CELERY_RESULT_BACKEND") +app.config['MONGO_URI'] = os.getenv("MONGO_URI") +celery = Celery(app.name, broker=app.config['CELERY_BROKER_URL'], backend=app.config['CELERY_RESULT_BACKEND']) +celery.conf.update(app.config) +mongo = MongoClient(app.config['MONGO_URI']) +db = mongo["docsgpt"] +vectors_collection = db["vectors"] + +@celery.task(bind=True) +def ingest(self, directory, formats, name_job, filename, user): + resp = ingest_worker(self, directory, formats, name_job, filename, user) + return resp @app.route("/") def home(): @@ -105,7 +128,10 @@ def api_answer(): try: # check if the vectorstore is set if "active_docs" in data: - vectorstore = "vectors/" + data["active_docs"] + if data["active_docs"].split("/")[0] == "local": + vectorstore = "indexes/" + data["active_docs"] + else: + vectorstore = "vectors/" + data["active_docs"] if data['active_docs'] == "default": vectorstore = "" else: @@ -160,7 +186,8 @@ def api_answer(): chain = VectorDBQA.from_chain_type(llm=llm, chain_type="map_reduce", vectorstore=docsearch, k=4, chain_type_kwargs={"question_prompt": p_chat_reduce, - "combine_prompt": p_chat_combine}) + "combine_prompt": p_chat_combine} + ) result = chain({"query": question}) else: qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce", @@ -195,6 +222,9 @@ def api_answer(): def check_docs(): # check if docs exist in a vectorstore folder data = request.get_json() + # split docs on / and take first part + if data["docs"].split("/")[0] == "local": + return {"status": 'exists'} vectorstore = "vectors/" + data["docs"] base_path = 'https://raw.githubusercontent.com/arc53/DocsHUB/main/' if os.path.exists(vectorstore) or data["docs"] == "default": @@ -243,6 +273,127 @@ def api_feedback(): ) return {"status": 'ok'} +@app.route('/api/combine', methods=['GET']) +def combined_json(): + user = 'local' + """Provide json file with combined available indexes.""" + # get json from https://d3dg1063dc54p9.cloudfront.net/combined.json + + data = [] + # structure: name, language, version, description, fullName, date, docLink + # append data from vectors_collection + for index in vectors_collection.find({'user': user}): + data.append({ + "name": index['name'], + "language": index['language'], + "version": '', + "description": index['name'], + "fullName": index['name'], + "date": index['date'], + "docLink": index['location'], + "model": embeddings_choice, + "location": "local" + }) + + data_remote = requests.get("https://d3dg1063dc54p9.cloudfront.net/combined.json").json() + for index in data_remote: + index['location'] = "remote" + data.append(index) + + + return jsonify(data) +@app.route('/api/upload', methods=['POST']) +def upload_file(): + """Upload a file to get vectorized and indexed.""" + if 'user' not in request.form: + return {"status": 'no user'} + user = request.form['user'] + if 'name' not in request.form: + return {"status": 'no name'} + job_name = request.form['name'] + # check if the post request has the file part + if 'file' not in request.files: + print('No file 
part') + return {"status": 'no file'} + file = request.files['file'] + if file.filename == '': + return {"status": 'no file name'} + + + if file: + filename = secure_filename(file.filename) + # save dir + save_dir = os.path.join(app.config['UPLOAD_FOLDER'], user, job_name) + # create dir if not exists + if not os.path.exists(save_dir): + os.makedirs(save_dir) + + file.save(os.path.join(save_dir, filename)) + task = ingest.delay('temp', [".rst", ".md", ".pdf"], job_name, filename, user) + # task id + task_id = task.id + return {"status": 'ok', "task_id": task_id} + else: + return {"status": 'error'} + +@app.route('/api/task_status', methods=['GET']) +def task_status(): + """Get celery job status.""" + task_id = request.args.get('task_id') + task = AsyncResult(task_id) + task_meta = task.info + return {"status": task.status, "result": task_meta} + +### Backgound task api +@app.route('/api/upload_index', methods=['POST']) +def upload_index_files(): + """Upload two files(index.faiss, index.pkl) to the user's folder.""" + if 'user' not in request.form: + return {"status": 'no user'} + user = request.form['user'] + if 'name' not in request.form: + return {"status": 'no name'} + job_name = request.form['name'] + if 'file_faiss' not in request.files: + print('No file part') + return {"status": 'no file'} + file_faiss = request.files['file_faiss'] + if file_faiss.filename == '': + return {"status": 'no file name'} + if 'file_pkl' not in request.files: + print('No file part') + return {"status": 'no file'} + file_pkl = request.files['file_pkl'] + if file_pkl.filename == '': + return {"status": 'no file name'} + + # saves index files + save_dir = os.path.join('indexes', user, job_name) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + file_faiss.save(os.path.join(save_dir, 'index.faiss')) + file_pkl.save(os.path.join(save_dir, 'index.pkl')) + # create entry in vectors_collection + vectors_collection.insert_one({ + "user": user, + "name": job_name, + "language": job_name, + "location": save_dir, + "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"), + "model": embeddings_choice, + "type": "local" + }) + return {"status": 'ok'} + + + +@app.route('/api/download', methods=['get']) +def download_file(): + user = request.args.get('user') + job_name = request.args.get('name') + filename = request.args.get('file') + save_dir = os.path.join(app.config['UPLOAD_FOLDER'], user, job_name) + return send_from_directory(save_dir, filename, as_attachment=True) # handling CORS @app.after_request diff --git a/application/parser/__init__.py b/application/parser/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/application/parser/__init__.py @@ -0,0 +1 @@ + diff --git a/application/parser/file/base.py b/application/parser/file/base.py new file mode 100644 index 0000000..c2777a0 --- /dev/null +++ b/application/parser/file/base.py @@ -0,0 +1,20 @@ +"""Base reader class.""" +from abc import abstractmethod +from typing import Any, List + +from langchain.docstore.document import Document as LCDocument + +from parser.schema.base import Document + + +class BaseReader: + """Utilities for loading data from a directory.""" + + @abstractmethod + def load_data(self, *args: Any, **load_kwargs: Any) -> List[Document]: + """Load data from the input directory.""" + + def load_langchain_documents(self, **load_kwargs: Any) -> List[LCDocument]: + """Load data in LangChain document format.""" + docs = self.load_data(**load_kwargs) + return [d.to_langchain_format() for d in docs] diff --git 
a/application/parser/file/base_parser.py b/application/parser/file/base_parser.py new file mode 100644 index 0000000..753a56f --- /dev/null +++ b/application/parser/file/base_parser.py @@ -0,0 +1,38 @@ +"""Base parser and config class.""" + +from abc import abstractmethod +from pathlib import Path +from typing import Dict, List, Optional, Union + + +class BaseParser: + """Base class for all parsers.""" + + def __init__(self, parser_config: Optional[Dict] = None): + """Init params.""" + self._parser_config = parser_config + + def init_parser(self) -> None: + """Init parser and store it.""" + parser_config = self._init_parser() + self._parser_config = parser_config + + @property + def parser_config_set(self) -> bool: + """Check if parser config is set.""" + return self._parser_config is not None + + @property + def parser_config(self) -> Dict: + """Check if parser config is set.""" + if self._parser_config is None: + raise ValueError("Parser config not set.") + return self._parser_config + + @abstractmethod + def _init_parser(self) -> Dict: + """Initialize the parser with the config.""" + + @abstractmethod + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse file.""" diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py new file mode 100644 index 0000000..b22f16b --- /dev/null +++ b/application/parser/file/bulk.py @@ -0,0 +1,163 @@ +"""Simple reader that reads files of different formats from a directory.""" +import logging +from pathlib import Path +from typing import Callable, Dict, List, Optional, Union + +from parser.file.base import BaseReader +from parser.file.base_parser import BaseParser +from parser.file.docs_parser import DocxParser, PDFParser +from parser.file.epub_parser import EpubParser +from parser.file.html_parser import HTMLParser +from parser.file.markdown_parser import MarkdownParser +from parser.file.rst_parser import RstParser +from parser.file.tabular_parser import PandasCSVParser +from parser.schema.base import Document + +DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { + ".pdf": PDFParser(), + ".docx": DocxParser(), + ".csv": PandasCSVParser(), + ".epub": EpubParser(), + ".md": MarkdownParser(), + ".rst": RstParser(), + ".html": HTMLParser(), + ".mdx": MarkdownParser(), +} + + +class SimpleDirectoryReader(BaseReader): + """Simple directory reader. + + Can read files into separate documents, or concatenates + files into one document text. + + Args: + input_dir (str): Path to the directory. + input_files (List): List of file paths to read (Optional; overrides input_dir) + exclude_hidden (bool): Whether to exclude hidden files (dotfiles). + errors (str): how encoding and decoding errors are to be handled, + see https://docs.python.org/3/library/functions.html#open + recursive (bool): Whether to recursively search in subdirectories. + False by default. + required_exts (Optional[List[str]]): List of required extensions. + Default is None. + file_extractor (Optional[Dict[str, BaseParser]]): A mapping of file + extension to a BaseParser class that specifies how to convert that file + to text. See DEFAULT_FILE_EXTRACTOR. + num_files_limit (Optional[int]): Maximum number of files to read. + Default is None. + file_metadata (Optional[Callable[str, Dict]]): A function that takes + in a filename and returns a Dict of metadata for the Document. + Default is None. 
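+ chunk_size_max (int): Maximum chunk size accepted by the reader; it is accepted by the constructor but not currently used when loading data.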
+ """ + + def __init__( + self, + input_dir: Optional[str] = None, + input_files: Optional[List] = None, + exclude_hidden: bool = True, + errors: str = "ignore", + recursive: bool = True, + required_exts: Optional[List[str]] = None, + file_extractor: Optional[Dict[str, BaseParser]] = None, + num_files_limit: Optional[int] = None, + file_metadata: Optional[Callable[[str], Dict]] = None, + chunk_size_max: int = 2048, + ) -> None: + """Initialize with parameters.""" + super().__init__() + + if not input_dir and not input_files: + raise ValueError("Must provide either `input_dir` or `input_files`.") + + self.errors = errors + + self.recursive = recursive + self.exclude_hidden = exclude_hidden + self.required_exts = required_exts + self.num_files_limit = num_files_limit + + if input_files: + self.input_files = [] + for path in input_files: + print(path) + input_file = Path(path) + self.input_files.append(input_file) + elif input_dir: + self.input_dir = Path(input_dir) + self.input_files = self._add_files(self.input_dir) + + self.file_extractor = file_extractor or DEFAULT_FILE_EXTRACTOR + self.file_metadata = file_metadata + + def _add_files(self, input_dir: Path) -> List[Path]: + """Add files.""" + input_files = sorted(input_dir.iterdir()) + new_input_files = [] + dirs_to_explore = [] + for input_file in input_files: + if input_file.is_dir(): + if self.recursive: + dirs_to_explore.append(input_file) + elif self.exclude_hidden and input_file.name.startswith("."): + continue + elif ( + self.required_exts is not None + and input_file.suffix not in self.required_exts + ): + continue + else: + new_input_files.append(input_file) + + for dir_to_explore in dirs_to_explore: + sub_input_files = self._add_files(dir_to_explore) + new_input_files.extend(sub_input_files) + + if self.num_files_limit is not None and self.num_files_limit > 0: + new_input_files = new_input_files[0 : self.num_files_limit] + + # print total number of files added + logging.debug( + f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}" + ) + + return new_input_files + + def load_data(self, concatenate: bool = False) -> List[Document]: + """Load data from the input directory. + + Args: + concatenate (bool): whether to concatenate all files into one document. + If set to True, file metadata is ignored. + False by default. + + Returns: + List[Document]: A list of documents. + + """ + data: Union[str, List[str]] = "" + data_list: List[str] = [] + metadata_list = [] + for input_file in self.input_files: + if input_file.suffix in self.file_extractor: + parser = self.file_extractor[input_file.suffix] + if not parser.parser_config_set: + parser.init_parser() + data = parser.parse_file(input_file, errors=self.errors) + else: + # do standard read + with open(input_file, "r", errors=self.errors) as f: + data = f.read() + if isinstance(data, List): + data_list.extend(data) + else: + data_list.append(str(data)) + if self.file_metadata is not None: + metadata_list.append(self.file_metadata(str(input_file))) + + if concatenate: + return [Document("\n".join(data_list))] + elif self.file_metadata is not None: + return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)] + else: + return [Document(d) for d in data_list] diff --git a/application/parser/file/docs_parser.py b/application/parser/file/docs_parser.py new file mode 100644 index 0000000..0cde407 --- /dev/null +++ b/application/parser/file/docs_parser.py @@ -0,0 +1,59 @@ +"""Docs parser. + +Contains parsers for docx, pdf files. 
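+PDFParser uses PyPDF2 and DocxParser uses docx2txt; both are imported lazily inside parse_file, so each dependency is only required when that format is actually parsed.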
+ +""" +from pathlib import Path +from typing import Dict + +from parser.file.base_parser import BaseParser + + +class PDFParser(BaseParser): + """PDF parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> str: + """Parse file.""" + try: + import PyPDF2 + except ImportError: + raise ValueError("PyPDF2 is required to read PDF files.") + text_list = [] + with open(file, "rb") as fp: + # Create a PDF object + pdf = PyPDF2.PdfReader(fp) + + # Get the number of pages in the PDF document + num_pages = len(pdf.pages) + + # Iterate over every page + for page in range(num_pages): + # Extract the text from the page + page_text = pdf.pages[page].extract_text() + text_list.append(page_text) + text = "\n".join(text_list) + + return text + + +class DocxParser(BaseParser): + """Docx parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> str: + """Parse file.""" + try: + import docx2txt + except ImportError: + raise ValueError("docx2txt is required to read Microsoft Word files.") + + text = docx2txt.process(file) + + return text diff --git a/application/parser/file/epub_parser.py b/application/parser/file/epub_parser.py new file mode 100644 index 0000000..6ece5ec --- /dev/null +++ b/application/parser/file/epub_parser.py @@ -0,0 +1,43 @@ +"""Epub parser. + +Contains parsers for epub files. +""" + +from pathlib import Path +from typing import Dict + +from parser.file.base_parser import BaseParser + + +class EpubParser(BaseParser): + """Epub Parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> str: + """Parse file.""" + try: + import ebooklib + from ebooklib import epub + except ImportError: + raise ValueError("`EbookLib` is required to read Epub files.") + try: + import html2text + except ImportError: + raise ValueError("`html2text` is required to parse Epub files.") + + text_list = [] + book = epub.read_epub(file, options={"ignore_ncx": True}) + + # Iterate through all chapters. + for item in book.get_items(): + # Chapters are typically located in epub documents items. + if item.get_type() == ebooklib.ITEM_DOCUMENT: + text_list.append( + html2text.html2text(item.get_content().decode("utf-8")) + ) + + text = "\n".join(text_list) + return text diff --git a/application/parser/file/html_parser.py b/application/parser/file/html_parser.py new file mode 100644 index 0000000..53d7492 --- /dev/null +++ b/application/parser/file/html_parser.py @@ -0,0 +1,82 @@ +"""HTML parser. + +Contains parser for html files. + +""" +import re +from pathlib import Path +from typing import Dict, Union + +from parser.file.base_parser import BaseParser + +class HTMLParser(BaseParser): + """HTML parser.""" + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]: + """Parse file. + + Returns: + Union[str, List[str]]: a string or a List of strings. 
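+ In this implementation the result is a list of chunk strings, each grouping a Title element with the text that follows it (chunks shorter than 25 characters are dropped).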
+ """ + try: + import unstructured + except ImportError: + raise ValueError("unstructured package is required to parse HTML files.") + from unstructured.partition.html import partition_html + from unstructured.staging.base import convert_to_isd + from unstructured.cleaners.core import clean + + # Using the unstructured library to convert the html to isd format + # isd sample : isd = [ + # {"text": "My Title", "type": "Title"}, + # {"text": "My Narrative", "type": "NarrativeText"} + # ] + with open(file, "r", encoding="utf-8") as fp: + elements = partition_html(file=fp) + isd = convert_to_isd(elements) + + # Removing non ascii charactwers from isd_el['text'] + for isd_el in isd: + isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode() + + # Removing all the \n characters from isd_el['text'] using regex and replace with single space + # Removing all the extra spaces from isd_el['text'] using regex and replace with single space + for isd_el in isd: + isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL) + isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL) + + # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation + for isd_el in isd: + clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True ) + + # Creating a list of all the indexes of isd_el['type'] = 'Title' + title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title'] + + # Creating 'Chunks' - List of lists of strings + # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title' + # Each Chunk can be thought of as an individual set of data, which can be sent to the model + # Where Each Title is grouped together with the data under it + + Chunks = [[]] + final_chunks = list(list()) + + for i,isd_el in enumerate(isd): + if i in title_indexes: + Chunks.append([]) + Chunks[-1].append(isd_el['text']) + + # Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable + for chunk in Chunks: + # sum of lenth of all the strings in the chunk + sum = 0 + sum += len(str(chunk)) + if sum < 25: + Chunks.remove(chunk) + else : + # appending all the approved chunks to final_chunks as a single string + final_chunks.append(" ".join([str(item) for item in chunk])) + return final_chunks diff --git a/application/parser/file/markdown_parser.py b/application/parser/file/markdown_parser.py new file mode 100644 index 0000000..2dd9e43 --- /dev/null +++ b/application/parser/file/markdown_parser.py @@ -0,0 +1,144 @@ +"""Markdown parser. + +Contains parser for md files. + +""" +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +from parser.file.base_parser import BaseParser +import tiktoken + + +class MarkdownParser(BaseParser): + """Markdown parser. + + Extract text from markdown files. + Returns dictionary with keys as headers and values as the text between headers. 
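+ Sections whose token count (tiktoken, cl100k_base encoding) exceeds max_tokens are split into max_tokens-character slices before being returned.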
+ + """ + + def __init__( + self, + *args: Any, + remove_hyperlinks: bool = True, + remove_images: bool = True, + max_tokens: int = 2048, + # remove_tables: bool = True, + **kwargs: Any, + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._remove_hyperlinks = remove_hyperlinks + self._remove_images = remove_images + self._max_tokens = max_tokens + # self._remove_tables = remove_tables + + + def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str): + """Append to tups chunk.""" + num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text)) + if num_tokens > self._max_tokens: + chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)] + for chunk in chunks: + tups.append((current_header, chunk)) + else: + tups.append((current_header, current_text)) + return tups + def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]: + """Convert a markdown file to a dictionary. + + The keys are the headers and the values are the text under each header. + + """ + markdown_tups: List[Tuple[Optional[str], str]] = [] + lines = markdown_text.split("\n") + + current_header = None + current_text = "" + + for line in lines: + header_match = re.match(r"^#+\s", line) + if header_match: + if current_header is not None: + if current_text == "" or None: + continue + markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text) + + current_header = line + current_text = "" + else: + current_text += line + "\n" + markdown_tups = self.tups_chunk_append(markdown_tups, current_header, current_text) + + if current_header is not None: + # pass linting, assert keys are defined + markdown_tups = [ + (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) + for key, value in markdown_tups + ] + else: + markdown_tups = [ + (key, re.sub("\n", "", value)) for key, value in markdown_tups + ] + + return markdown_tups + + def remove_images(self, content: str) -> str: + """Get a dictionary of a markdown file from its path.""" + pattern = r"!{1}\[\[(.*)\]\]" + content = re.sub(pattern, "", content) + return content + + # def remove_tables(self, content: str) -> List[List[str]]: + # """Convert markdown tables to nested lists.""" + # table_rows_pattern = r"((\r?\n){2}|^)([^\r\n]*\|[^\r\n]*(\r?\n)?)+(?=(\r?\n){2}|$)" + # table_cells_pattern = r"([^\|\r\n]*)\|" + # + # table_rows = re.findall(table_rows_pattern, content, re.MULTILINE) + # table_lists = [] + # for row in table_rows: + # cells = re.findall(table_cells_pattern, row[2]) + # cells = [cell.strip() for cell in cells if cell.strip()] + # table_lists.append(cells) + # return str(table_lists) + + def remove_hyperlinks(self, content: str) -> str: + """Get a dictionary of a markdown file from its path.""" + pattern = r"\[(.*?)\]\((.*?)\)" + content = re.sub(pattern, r"\1", content) + return content + + def _init_parser(self) -> Dict: + """Initialize the parser with the config.""" + return {} + + def parse_tups( + self, filepath: Path, errors: str = "ignore" + ) -> List[Tuple[Optional[str], str]]: + """Parse file into tuples.""" + with open(filepath, "r") as f: + content = f.read() + if self._remove_hyperlinks: + content = self.remove_hyperlinks(content) + if self._remove_images: + content = self.remove_images(content) + # if self._remove_tables: + # content = self.remove_tables(content) + markdown_tups = self.markdown_to_tups(content) + return markdown_tups + + def parse_file( + 
self, filepath: Path, errors: str = "ignore" + ) -> Union[str, List[str]]: + """Parse file into string.""" + tups = self.parse_tups(filepath, errors=errors) + results = [] + # TODO: don't include headers right now + for header, value in tups: + if header is None: + results.append(value) + else: + results.append(f"\n\n{header}\n{value}") + return results diff --git a/application/parser/file/rst_parser.py b/application/parser/file/rst_parser.py new file mode 100644 index 0000000..0a4724f --- /dev/null +++ b/application/parser/file/rst_parser.py @@ -0,0 +1,186 @@ +"""reStructuredText parser. + +Contains parser for md files. + +""" +import re +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union, cast + +from parser.file.base_parser import BaseParser +import tiktoken + +class RstParser(BaseParser): + """reStructuredText parser. + + Extract text from .rst files. + Returns dictionary with keys as headers and values as the text between headers. + + """ + + def __init__( + self, + *args: Any, + remove_hyperlinks: bool = True, + remove_images: bool = True, + remove_table_excess: bool = True, + remove_interpreters: bool = True, + remove_directives: bool = True, + remove_whitespaces_excess: bool = True, + #Be carefull with remove_characters_excess, might cause data loss + remove_characters_excess: bool = True, + max_tokens: int = 2048, + **kwargs: Any, + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._remove_hyperlinks = remove_hyperlinks + self._remove_images = remove_images + self._remove_table_excess = remove_table_excess + self._remove_interpreters = remove_interpreters + self._remove_directives = remove_directives + self._remove_whitespaces_excess = remove_whitespaces_excess + self._remove_characters_excess = remove_characters_excess + self._max_tokens = max_tokens + + def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str): + """Append to tups chunk.""" + num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text)) + if num_tokens > self._max_tokens: + chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)] + for chunk in chunks: + tups.append((current_header, chunk)) + else: + tups.append((current_header, current_text)) + return tups + + + def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]: + """Convert a reStructuredText file to a dictionary. + + The keys are the headers and the values are the text under each header. 
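+ Headers are detected from '=' or '-' underline lines; the line above the underline becomes the key for the text that follows.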
+ + """ + rst_tups: List[Tuple[Optional[str], str]] = [] + lines = rst_text.split("\n") + + current_header = None + current_text = "" + + for i, line in enumerate(lines): + header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line) + if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]): + if current_header is not None: + if current_text == "" or None: + continue + # removes the next heading from current Document + if current_text.endswith(lines[i - 1] + "\n"): + current_text = current_text[:len(current_text) - len(lines[i - 1] + "\n")] + rst_tups = self.tups_chunk_append(rst_tups, current_header, current_text) + + current_header = lines[i - 1] + current_text = "" + else: + current_text += line + "\n" + + rst_tups = self.tups_chunk_append(rst_tups, current_header, current_text) + + #TODO: Format for rst + # + # if current_header is not None: + # # pass linting, assert keys are defined + # rst_tups = [ + # (re.sub(r"#", "", cast(str, key)).strip(), re.sub(r"<.*?>", "", value)) + # for key, value in rst_tups + # ] + # else: + # rst_tups = [ + # (key, re.sub("\n", "", value)) for key, value in rst_tups + # ] + + if current_header is None: + rst_tups = [ + (key, re.sub("\n", "", value)) for key, value in rst_tups + ] + return rst_tups + + def remove_images(self, content: str) -> str: + pattern = r"\.\. image:: (.*)" + content = re.sub(pattern, "", content) + return content + + def remove_hyperlinks(self, content: str) -> str: + pattern = r"`(.*?) <(.*?)>`_" + content = re.sub(pattern, r"\1", content) + return content + + def remove_directives(self, content: str) -> str: + """Removes reStructuredText Directives""" + pattern = r"`\.\.([^:]+)::" + content = re.sub(pattern, "", content) + return content + + def remove_interpreters(self, content: str) -> str: + """Removes reStructuredText Interpreted Text Roles""" + pattern = r":(\w+):" + content = re.sub(pattern, "", content) + return content + + def remove_table_excess(self, content: str) -> str: + """Pattern to remove grid table separators""" + pattern = r"^\+[-]+\+[-]+\+$" + content = re.sub(pattern, "", content, flags=re.MULTILINE) + return content + + def remove_whitespaces_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]: + """Pattern to match 2 or more consecutive whitespaces""" + pattern = r"\s{2,}" + content = [(key, re.sub(pattern, " ", value)) for key, value in content] + return content + + def remove_characters_excess(self, content: List[Tuple[str, Any]]) -> List[Tuple[str, Any]]: + """Pattern to match 2 or more consecutive characters""" + pattern = r"(\S)\1{2,}" + content = [(key, re.sub(pattern, r"\1\1\1", value, flags=re.MULTILINE)) for key, value in content] + return content + + def _init_parser(self) -> Dict: + """Initialize the parser with the config.""" + return {} + + def parse_tups( + self, filepath: Path, errors: str = "ignore" + ) -> List[Tuple[Optional[str], str]]: + """Parse file into tuples.""" + with open(filepath, "r") as f: + content = f.read() + if self._remove_hyperlinks: + content = self.remove_hyperlinks(content) + if self._remove_images: + content = self.remove_images(content) + if self._remove_table_excess: + content = self.remove_table_excess(content) + if self._remove_directives: + content = self.remove_directives(content) + if self._remove_interpreters: + content = self.remove_interpreters(content) + rst_tups = self.rst_to_tups(content) + if self._remove_whitespaces_excess: + rst_tups = 
self.remove_whitespaces_excess(rst_tups) + if self._remove_characters_excess: + rst_tups = self.remove_characters_excess(rst_tups) + return rst_tups + + def parse_file( + self, filepath: Path, errors: str = "ignore" + ) -> Union[str, List[str]]: + """Parse file into string.""" + tups = self.parse_tups(filepath, errors=errors) + results = [] + # TODO: don't include headers right now + for header, value in tups: + if header is None: + results.append(value) + else: + results.append(f"\n\n{header}\n{value}") + return results diff --git a/application/parser/file/tabular_parser.py b/application/parser/file/tabular_parser.py new file mode 100644 index 0000000..bbb875e --- /dev/null +++ b/application/parser/file/tabular_parser.py @@ -0,0 +1,115 @@ +"""Tabular parser. + +Contains parsers for tabular data files. + +""" +from pathlib import Path +from typing import Any, Dict, List, Union + +from parser.file.base_parser import BaseParser + + +class CSVParser(BaseParser): + """CSV parser. + + Args: + concat_rows (bool): whether to concatenate all rows into one document. + If set to False, a Document will be created for each row. + True by default. + + """ + + def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._concat_rows = concat_rows + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse file. + + Returns: + Union[str, List[str]]: a string or a List of strings. + + """ + try: + import csv + except ImportError: + raise ValueError("csv module is required to read CSV files.") + text_list = [] + with open(file, "r") as fp: + csv_reader = csv.reader(fp) + for row in csv_reader: + text_list.append(", ".join(row)) + if self._concat_rows: + return "\n".join(text_list) + else: + return text_list + + +class PandasCSVParser(BaseParser): + r"""Pandas-based CSV parser. + + Parses CSVs using the separator detection from Pandas `read_csv`function. + If special parameters are required, use the `pandas_config` dict. + + Args: + concat_rows (bool): whether to concatenate all rows into one document. + If set to False, a Document will be created for each row. + True by default. + + col_joiner (str): Separator to use for joining cols per row. + Set to ", " by default. + + row_joiner (str): Separator to use for joining each row. + Only used when `concat_rows=True`. + Set to "\n" by default. + + pandas_config (dict): Options for the `pandas.read_csv` function call. + Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html + for more information. + Set to empty dict by default, this means pandas will try to figure + out the separators, table head, etc. on its own. 
+ + """ + + def __init__( + self, + *args: Any, + concat_rows: bool = True, + col_joiner: str = ", ", + row_joiner: str = "\n", + pandas_config: dict = {}, + **kwargs: Any + ) -> None: + """Init params.""" + super().__init__(*args, **kwargs) + self._concat_rows = concat_rows + self._col_joiner = col_joiner + self._row_joiner = row_joiner + self._pandas_config = pandas_config + + def _init_parser(self) -> Dict: + """Init parser.""" + return {} + + def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]: + """Parse file.""" + try: + import pandas as pd + except ImportError: + raise ValueError("pandas module is required to read CSV files.") + + df = pd.read_csv(file, **self._pandas_config) + + text_list = df.apply( + lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1 + ).tolist() + + if self._concat_rows: + return (self._row_joiner).join(text_list) + else: + return text_list diff --git a/application/parser/java2doc.py b/application/parser/java2doc.py new file mode 100644 index 0000000..c1701c5 --- /dev/null +++ b/application/parser/java2doc.py @@ -0,0 +1,61 @@ +import os +import javalang + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.java'): + files_list.append(os.path.join(root, file)) + return files_list + +def extract_functions(file_path): + with open(file_path, "r") as file: + java_code = file.read() + methods = {} + tree = javalang.parse.parse(java_code) + for _, node in tree.filter(javalang.tree.MethodDeclaration): + method_name = node.name + start_line = node.position.line - 1 + end_line = start_line + brace_count = 0 + for line in java_code.splitlines()[start_line:]: + end_line += 1 + brace_count += line.count("{") - line.count("}") + if brace_count == 0: + break + method_source_code = "\n".join(java_code.splitlines()[start_line:end_line]) + methods[method_name] = method_source_code + return methods + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = javalang.parse.parse(source_code) + for class_decl in tree.types: + class_name = class_decl.name + declarations = [] + methods = [] + for field_decl in class_decl.fields: + field_name = field_decl.declarators[0].name + field_type = field_decl.type.name + declarations.append(f"{field_type} {field_name}") + for method_decl in class_decl.methods: + methods.append(method_decl.name) + class_string = "Declarations: " + ", ".join(declarations) + "\n Method name: " + ", ".join(methods) + classes[class_name] = class_string + return classes + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict \ No newline at end of file diff --git a/application/parser/js2doc.py b/application/parser/js2doc.py new file mode 100644 index 0000000..d434ab2 --- /dev/null +++ b/application/parser/js2doc.py @@ -0,0 +1,67 @@ +import os +import esprima +import escodegen + + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.js'): + files_list.append(os.path.join(root, file)) + return files_list + +def extract_functions(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + 
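+ # collect top-level function declarations, function expressions assigned to variables, and class methods, keyed by name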
functions = {} + tree = esprima.parseScript(source_code) + for node in tree.body: + if node.type == 'FunctionDeclaration': + func_name = node.id.name if node.id else '' + functions[func_name] = escodegen.generate(node) + elif node.type == 'VariableDeclaration': + for declaration in node.declarations: + if declaration.init and declaration.init.type == 'FunctionExpression': + func_name = declaration.id.name if declaration.id else '' + functions[func_name] = escodegen.generate(declaration.init) + elif node.type == 'ClassDeclaration': + class_name = node.id.name + for subnode in node.body.body: + if subnode.type == 'MethodDefinition': + func_name = subnode.key.name + functions[func_name] = escodegen.generate(subnode.value) + elif subnode.type == 'VariableDeclaration': + for declaration in subnode.declarations: + if declaration.init and declaration.init.type == 'FunctionExpression': + func_name = declaration.id.name if declaration.id else '' + functions[func_name] = escodegen.generate(declaration.init) + return functions + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = esprima.parseScript(source_code) + for node in tree.body: + if node.type == 'ClassDeclaration': + class_name = node.id.name + function_names = [] + for subnode in node.body.body: + if subnode.type == 'MethodDefinition': + function_names.append(subnode.key.name) + classes[class_name] = ", ".join(function_names) + return classes + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict diff --git a/application/parser/open_ai_func.py b/application/parser/open_ai_func.py new file mode 100644 index 0000000..c25b2d0 --- /dev/null +++ b/application/parser/open_ai_func.py @@ -0,0 +1,81 @@ +import os +import faiss +import pickle +import tiktoken +from langchain.vectorstores import FAISS +from langchain.embeddings import OpenAIEmbeddings + +#from langchain.embeddings import HuggingFaceEmbeddings +#from langchain.embeddings import HuggingFaceInstructEmbeddings +#from langchain.embeddings import CohereEmbeddings + +from retry import retry + + + +def num_tokens_from_string(string: str, encoding_name: str) -> int: +# Function to convert string to tokens and estimate user cost. + encoding = tiktoken.get_encoding(encoding_name) + num_tokens = len(encoding.encode(string)) + total_price = ((num_tokens/1000) * 0.0004) + return num_tokens, total_price + +@retry(tries=10, delay=60) +def store_add_texts_with_retry(store, i): + store.add_texts([i.page_content], metadatas=[i.metadata]) + #store_pine.add_texts([i.page_content], metadatas=[i.metadata]) + +def call_openai_api(docs, folder_name, task_status): +# Function to create a vector store from the documents and save it to disk. 
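+# The bound Celery task is passed in as task_status; embedding progress is reported back via task_status.update_state(state='PROGRESS', ...).
+# The first document seeds the FAISS store, then the remaining docs are added one at a time with retries so partial progress can be saved if the API fails.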
+ + # create output folder if it doesn't exist + if not os.path.exists(f"{folder_name}"): + os.makedirs(f"{folder_name}") + + from tqdm import tqdm + docs_test = [docs[0]] + docs.pop(0) + c1 = 0 + + store = FAISS.from_documents(docs_test, OpenAIEmbeddings(openai_api_key=os.getenv("EMBEDDINGS_KEY"))) + + # Uncomment for MPNet embeddings + # model_name = "sentence-transformers/all-mpnet-base-v2" + # hf = HuggingFaceEmbeddings(model_name=model_name) + # store = FAISS.from_documents(docs_test, hf) + s1 = len(docs) + for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'): + try: + task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)}) + store_add_texts_with_retry(store, i) + except Exception as e: + print(e) + print("Error on ", i) + print("Saving progress") + print(f"stopped at {c1} out of {len(docs)}") + store.save_local(f"{folder_name}") + break + c1 += 1 + store.save_local(f"{folder_name}") + +def get_user_permission(docs, folder_name): +# Function to ask user permission to call the OpenAI api and spend their OpenAI funds. + # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents. + #docs_content = (" ".join(docs)) + docs_content = "" + for doc in docs: + docs_content += doc.page_content + + + tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base") + # Here we print the number of tokens and the approx user cost with some visually appealing formatting. + print(f"Number of Tokens = {format(tokens, ',d')}") + print(f"Approx Cost = ${format(total_price, ',.2f')}") + #Here we check for user permission before calling the API. + user_input = input("Price Okay? (Y/N) \n").lower() + if user_input == "y": + call_openai_api(docs, folder_name) + elif user_input == "": + call_openai_api(docs, folder_name) + else: + print("The API was not called. 
No money was spent.") diff --git a/application/parser/py2doc.py b/application/parser/py2doc.py new file mode 100644 index 0000000..4ac73cd --- /dev/null +++ b/application/parser/py2doc.py @@ -0,0 +1,113 @@ +import os +import ast +import tiktoken +from pathlib import Path +from langchain.llms import OpenAI +from langchain.prompts import PromptTemplate + +def find_files(directory): + files_list = [] + for root, dirs, files in os.walk(directory): + for file in files: + if file.endswith('.py'): + files_list.append(os.path.join(root, file)) + return files_list + +def extract_functions(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + functions = {} + tree = ast.parse(source_code) + for node in ast.walk(tree): + if isinstance(node, ast.FunctionDef): + func_name = node.name + func_def = ast.get_source_segment(source_code, node) + functions[func_name] = func_def + return functions + +def extract_classes(file_path): + with open(file_path, 'r') as file: + source_code = file.read() + classes = {} + tree = ast.parse(source_code) + for node in ast.walk(tree): + if isinstance(node, ast.ClassDef): + class_name = node.name + function_names = [] + for subnode in ast.walk(node): + if isinstance(subnode, ast.FunctionDef): + function_names.append(subnode.name) + classes[class_name] = ", ".join(function_names) + return classes + +def extract_functions_and_classes(directory): + files = find_files(directory) + functions_dict = {} + classes_dict = {} + for file in files: + functions = extract_functions(file) + if functions: + functions_dict[file] = functions + classes = extract_classes(file) + if classes: + classes_dict[file] = classes + return functions_dict, classes_dict + +def parse_functions(functions_dict, formats, dir): + c1 = len(functions_dict) + for i, (source, functions) in enumerate(functions_dict.items(), start=1): + print(f"Processing file {i}/{c1}") + source_w = source.replace(dir+"/", "").replace("."+formats, ".md") + subfolders = "/".join(source_w.split("/")[:-1]) + Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True) + for j, (name, function) in enumerate(functions.items(), start=1): + print(f"Processing function {j}/{len(functions)}") + prompt = PromptTemplate( + input_variables=["code"], + template="Code: \n{code}, \nDocumentation: ", + ) + llm = OpenAI(temperature=0) + response = llm(prompt.format(code=function)) + mode = "a" if Path(f"outputs/{source_w}").exists() else "w" + with open(f"outputs/{source_w}", mode) as f: + f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}") + + +def parse_classes(classes_dict, formats, dir): + c1 = len(classes_dict) + for i, (source, classes) in enumerate(classes_dict.items()): + print(f"Processing file {i+1}/{c1}") + source_w = source.replace(dir+"/", "").replace("."+formats, ".md") + subfolders = "/".join(source_w.split("/")[:-1]) + Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True) + for name, function_names in classes.items(): + print(f"Processing Class {i+1}/{c1}") + prompt = PromptTemplate( + input_variables=["class_name", "functions_names"], + template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ", + ) + llm = OpenAI(temperature=0) + response = llm(prompt.format(class_name=name, functions_names=function_names)) + + with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f: + f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}") + +def 
transform_to_docs(functions_dict, classes_dict, formats, dir): + docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()]) + docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()]) + + num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(docs_content)) + total_price = ((num_tokens / 1000) * 0.02) + + print(f"Number of Tokens = {num_tokens:,d}") + print(f"Approx Cost = ${total_price:,.2f}") + + user_input = input("Price Okay? (Y/N)\n").lower() + if user_input == "y" or user_input == "": + if not Path("outputs").exists(): + Path("outputs").mkdir() + parse_functions(functions_dict, formats, dir) + parse_classes(classes_dict, formats, dir) + print("All done!") + else: + print("The API was not called. No money was spent.") \ No newline at end of file diff --git a/application/parser/schema/base.py b/application/parser/schema/base.py new file mode 100644 index 0000000..0871f88 --- /dev/null +++ b/application/parser/schema/base.py @@ -0,0 +1,35 @@ +"""Base schema for readers.""" +from dataclasses import dataclass + +from langchain.docstore.document import Document as LCDocument + +from parser.schema.schema import BaseDocument + + +@dataclass +class Document(BaseDocument): + """Generic interface for a data document. + + This document connects to data sources. + + """ + + def __post_init__(self) -> None: + """Post init.""" + if self.text is None: + raise ValueError("text field not set.") + + @classmethod + def get_type(cls) -> str: + """Get Document type.""" + return "Document" + + def to_langchain_format(self) -> LCDocument: + """Convert struct to LangChain document format.""" + metadata = self.extra_info or {} + return LCDocument(page_content=self.text, metadata=metadata) + + @classmethod + def from_langchain_format(cls, doc: LCDocument) -> "Document": + """Convert struct from LangChain document format.""" + return cls(text=doc.page_content, extra_info=doc.metadata) diff --git a/application/parser/schema/schema.py b/application/parser/schema/schema.py new file mode 100644 index 0000000..ec467e5 --- /dev/null +++ b/application/parser/schema/schema.py @@ -0,0 +1,64 @@ +"""Base schema for data structures.""" +from abc import abstractmethod +from dataclasses import dataclass +from typing import Any, Dict, List, Optional + +from dataclasses_json import DataClassJsonMixin + + +@dataclass +class BaseDocument(DataClassJsonMixin): + """Base document. + + Generic abstract interfaces that captures both index structs + as well as documents. + + """ + + # TODO: consolidate fields from Document/IndexStruct into base class + text: Optional[str] = None + doc_id: Optional[str] = None + embedding: Optional[List[float]] = None + + # extra fields + extra_info: Optional[Dict[str, Any]] = None + + @classmethod + @abstractmethod + def get_type(cls) -> str: + """Get Document type.""" + + def get_text(self) -> str: + """Get text.""" + if self.text is None: + raise ValueError("text field not set.") + return self.text + + def get_doc_id(self) -> str: + """Get doc_id.""" + if self.doc_id is None: + raise ValueError("doc_id not set.") + return self.doc_id + + @property + def is_doc_id_none(self) -> bool: + """Check if doc_id is None.""" + return self.doc_id is None + + def get_embedding(self) -> List[float]: + """Get embedding. + + Errors if embedding is None. 
+ + """ + if self.embedding is None: + raise ValueError("embedding not set.") + return self.embedding + + @property + def extra_info_str(self) -> Optional[str]: + """Extra info string.""" + if self.extra_info is None: + return None + + return "\n".join([f"{k}: {str(v)}" for k, v in self.extra_info.items()]) diff --git a/application/static/dist/css/output.css b/application/static/dist/css/output.css index 1b5360e..a8ac162 100644 --- a/application/static/dist/css/output.css +++ b/application/static/dist/css/output.css @@ -525,6 +525,10 @@ video { position: absolute; } +.relative { + position: relative; +} + .inset-0 { top: 0px; right: 0px; @@ -604,6 +608,10 @@ video { min-height: 100vh; } +.w-auto { + width: auto; +} + .w-full { width: 100%; } @@ -648,12 +656,16 @@ video { overflow-y: auto; } +.rounded { + border-radius: 0.25rem; +} + .rounded-lg { border-radius: 0.5rem; } -.rounded { - border-radius: 0.25rem; +.rounded-md { + border-radius: 0.375rem; } .border { @@ -723,6 +735,11 @@ video { padding-bottom: 0.5rem; } +.py-4 { + padding-top: 1rem; + padding-bottom: 1rem; +} + .pt-4 { padding-top: 1rem; } @@ -761,6 +778,11 @@ video { line-height: 1.25rem; } +.text-xl { + font-size: 1.25rem; + line-height: 1.75rem; +} + .font-medium { font-weight: 500; } @@ -842,6 +864,11 @@ video { } } +.hover\:bg-blue-600:hover { + --tw-bg-opacity: 1; + background-color: rgb(37 99 235 / var(--tw-bg-opacity)); +} + .hover\:bg-blue-700:hover { --tw-bg-opacity: 1; background-color: rgb(29 78 216 / var(--tw-bg-opacity)); @@ -862,11 +889,26 @@ video { border-color: rgb(59 130 246 / var(--tw-border-opacity)); } +.focus\:outline-none:focus { + outline: 2px solid transparent; + outline-offset: 2px; +} + +.focus\:ring-2:focus { + --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color); + --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(2px + var(--tw-ring-offset-width)) var(--tw-ring-color); + box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000); +} + .focus\:ring-blue-500:focus { --tw-ring-opacity: 1; --tw-ring-color: rgb(59 130 246 / var(--tw-ring-opacity)); } +.focus\:ring-offset-2:focus { + --tw-ring-offset-width: 2px; +} + @media (min-width: 640px) { .sm\:my-8 { margin-top: 2rem; @@ -881,6 +923,10 @@ video { display: inline-block; } + .sm\:inline { + display: inline; + } + .sm\:h-screen { height: 100vh; } diff --git a/application/static/src/chat.js b/application/static/src/chat.js index 3bac21f..5422687 100644 --- a/application/static/src/chat.js +++ b/application/static/src/chat.js @@ -71,4 +71,6 @@ function submitForm(event){ }); } -window.addEventListener('submit',submitForm) +//window.addEventListener('submit',submitForm) +// rewrite using id = button-submit +document.getElementById("button-submit").addEventListener('click',submitForm) diff --git a/application/templates/index.html b/application/templates/index.html index 4f8e471..1d78cca 100644 --- a/application/templates/index.html +++ b/application/templates/index.html @@ -86,6 +86,19 @@ This will return a new DataFrame with all the columns from both tables, and only +
+ + + + + + +
+ + + @@ -130,7 +143,7 @@ This will return a new DataFrame with all the columns from both tables, and only function docsIndex() { // loads latest index from https://raw.githubusercontent.com/arc53/DocsHUB/main/combined.json // and stores it in localStorage - fetch('https://d3dg1063dc54p9.cloudfront.net/combined.json') + fetch('/api/combine') .then(response => response.json()) .then(data => { localStorage.setItem("docsIndex", JSON.stringify(data)); @@ -150,19 +163,26 @@ This will return a new DataFrame with all the columns from both tables, and only // create option for each key in docsIndex for (var key in docsIndex) { var option = document.createElement("option"); - if (docsIndex[key].name == docsIndex[key].language) { - option.text = docsIndex[key].name + " " + docsIndex[key].version; - option.value = docsIndex[key].name + "/" + ".project" + "/" + docsIndex[key].version + "/{{ embeddings_choice }}/"; - if (docsIndex[key].model == "{{ embeddings_choice }}") { - select.add(option); + if (docsIndex[key].location == 'docshub'){ + if (docsIndex[key].name == docsIndex[key].language) { + option.text = docsIndex[key].name + " " + docsIndex[key].version; + option.value = docsIndex[key].name + "/" + ".project" + "/" + docsIndex[key].version + "/{{ embeddings_choice }}/"; + if (docsIndex[key].model == "{{ embeddings_choice }}") { + select.add(option); + } + } + else { + option.text = docsIndex[key].name + " " + docsIndex[key].version; + option.value = docsIndex[key].language + "/" + docsIndex[key].name + "/" + docsIndex[key].version + "/{{ embeddings_choice }}/"; + if (docsIndex[key].model == "{{ embeddings_choice }}") { + select.add(option); + } } } else { - option.text = docsIndex[key].name + " " + docsIndex[key].version; - option.value = docsIndex[key].language + "/" + docsIndex[key].name + "/" + docsIndex[key].version + "/{{ embeddings_choice }}/"; - if (docsIndex[key].model == "{{ embeddings_choice }}") { - select.add(option); - } + option.text = docsIndex[key].name; + option.value = docsIndex[key].location + "/" + docsIndex[key].name; + select.add(option); } } diff --git a/application/worker.py b/application/worker.py new file mode 100644 index 0000000..2891406 --- /dev/null +++ b/application/worker.py @@ -0,0 +1,57 @@ +import requests +import nltk +import os +from langchain.text_splitter import RecursiveCharacterTextSplitter + +from parser.file.bulk import SimpleDirectoryReader +from parser.schema.base import Document +from parser.open_ai_func import call_openai_api +from celery import current_task + +nltk.download('punkt', quiet=True) +nltk.download('averaged_perceptron_tagger', quiet=True) + +def ingest_worker(self, directory, formats, name_job, filename, user): + # directory = 'inputs' + # formats = [".rst", ".md"] + input_files = None + recursive = True + limit = None + exclude = True + # name_job = 'job1' + # filename = 'install.rst' + # user = 'local' + url = 'http://localhost:5001/api/download' + file_data = {'name': name_job, 'file': filename, 'user': user} + response = requests.get(url, params=file_data) + file = response.content + # save in folder inputs + # create folder if not exists + if not os.path.exists(directory): + os.makedirs(directory) + with open(directory + '/' + filename, 'wb') as f: + f.write(file) + + import time + self.update_state(state='PROGRESS', meta={'current': 1}) + + raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=input_files, recursive=recursive, + required_exts=formats, num_files_limit=limit, + exclude_hidden=exclude).load_data() + 
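+ # convert parser documents to LangChain Documents so the splitter and embedding step below can consume them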
raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + # Here we split the documents, as needed, into smaller chunks. + # We do this due to the context limits of the LLMs. + text_splitter = RecursiveCharacterTextSplitter() + docs = text_splitter.split_documents(raw_docs) + call_openai_api(docs, directory, self) + self.update_state(state='PROGRESS', meta={'current': 100}) + + # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl + # and send them to the server (provide user and name in form) + url = 'http://localhost:5001/api/upload_index' + file_data = {'name': name_job, 'user': user} + files = {'file_faiss': open(directory + '/index.faiss', 'rb'), + 'file_pkl': open(directory + '/index.pkl', 'rb')} + response = requests.post(url, files=files, data=file_data) + print(response.text) + return {'directory': directory, 'formats': formats, 'name_job': name_job, 'filename': filename, 'user': user} diff --git a/docker-compose.yaml b/docker-compose.yaml index 0ed9d41..2a96868 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -7,9 +7,30 @@ services: - API_HOST=http://backend:5001 ports: - "5173:5173" + depends_on: + - backend backend: build: ./application ports: - "5001:5001" + depends_on: + - redis + - mongo + redis: + image: redis:6-alpine + ports: + - 6379:6379 + + mongo: + image: mongo:6 + ports: + - 27017:27017 + volumes: + - mongodb_data_container:/data/db + + + +volumes: + mongodb_data_container: \ No newline at end of file diff --git a/scripts/parser/file/bulk.py b/scripts/parser/file/bulk.py index cebc0b5..871123c 100644 --- a/scripts/parser/file/bulk.py +++ b/scripts/parser/file/bulk.py @@ -76,6 +76,8 @@ class SimpleDirectoryReader(BaseReader): self.exclude_hidden = exclude_hidden self.required_exts = required_exts self.num_files_limit = num_files_limit + print("input_files") + print(input_files) if input_files: self.input_files = [] diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index a89f948..368e0d5 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -23,6 +23,7 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int: @retry(tries=10, delay=60) def store_add_texts_with_retry(store, i): store.add_texts([i.page_content], metadatas=[i.metadata]) + #store_pine.add_texts([i.page_content], metadatas=[i.metadata]) def call_openai_api(docs, folder_name): # Function to create a vector store from the documents and save it to disk. 
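For context on the hunk above: chunks are embedded and added to the store one document at a time, so a single OpenAI rate-limit error only affects one chunk, and the retry decorator sleeps and retries instead of failing the whole run. A condensed, self-contained sketch of that pattern follows; the function names build_faiss_index and add_chunk_with_retry are illustrative, the 10 tries / 60 s delay simply mirror the decorator above, and it assumes the retry package plus the LangChain FAISS and OpenAIEmbeddings wrappers already listed in the requirements.

from retry import retry
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS


@retry(tries=10, delay=60)  # wait 60 s between attempts, give up after 10 tries
def add_chunk_with_retry(store, doc):
    # one chunk per call: a rate-limit error re-runs only this chunk
    store.add_texts([doc.page_content], metadatas=[doc.metadata])


def build_faiss_index(docs, save_dir):
    # seed the index with the first chunk, then append the rest with retries
    store = FAISS.from_documents(docs[:1], OpenAIEmbeddings())
    for doc in docs[1:]:
        add_chunk_with_retry(store, doc)
    store.save_local(save_dir)  # writes index.faiss and index.pkl, the files the worker uploads
    return store

This is a sketch of the pattern, not a drop-in replacement for call_openai_api; the real function is also handed the Celery task object so it can report progress.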
@@ -38,7 +39,13 @@ def call_openai_api(docs, folder_name): # cut first n docs if you want to restart #docs = docs[:n] c1 = 0 + # pinecone.init( + # api_key="", # find at app.pinecone.io + # environment="us-east1-gcp" # next to api key in console + # ) + #index_name = "pandas" store = FAISS.from_documents(docs_test, OpenAIEmbeddings()) + #store_pine = Pinecone.from_documents(docs_test, OpenAIEmbeddings(), index_name=index_name) # Uncomment for MPNet embeddings # model_name = "sentence-transformers/all-mpnet-base-v2" From c297e076e602477964c6b32a597773cbf47dac28 Mon Sep 17 00:00:00 2001 From: Alex Date: Mon, 13 Mar 2023 21:56:09 +0000 Subject: [PATCH 2/6] folders --- application/app.py | 15 ++++++++++++++- application/worker.py | 42 +++++++++++++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 12 deletions(-) diff --git a/application/app.py b/application/app.py index 406445c..a136cc7 100644 --- a/application/app.py +++ b/application/app.py @@ -26,7 +26,7 @@ from pymongo import MongoClient from celery import Celery, current_task from celery.result import AsyncResult -from worker import my_background_task_worker, ingest_worker +from worker import ingest_worker # os.environ["LANGCHAIN_HANDLER"] = "langchain" @@ -395,6 +395,19 @@ def download_file(): save_dir = os.path.join(app.config['UPLOAD_FOLDER'], user, job_name) return send_from_directory(save_dir, filename, as_attachment=True) +@app.route('/api/delete_old', methods=['get']) +def delete_old(): + """Delete old indexes.""" + import shutil + path = request.args.get('path') + first_dir = path.split('/')[0] + # check that path strats with indexes or vectors + if first_dir not in ['indexes', 'vectors']: + return {"status": 'error'} + shutil.rmtree(path) + vectors_collection.delete_one({'location': path}) + return {"status": 'ok'} + # handling CORS @app.after_request def after_request(response): diff --git a/application/worker.py b/application/worker.py index 2891406..5523131 100644 --- a/application/worker.py +++ b/application/worker.py @@ -10,9 +10,16 @@ from celery import current_task nltk.download('punkt', quiet=True) nltk.download('averaged_perceptron_tagger', quiet=True) +import string +import zipfile +import shutil +def generate_random_string(length): + return ''.join([string.ascii_letters[i % 52] for i in range(length)]) + + def ingest_worker(self, directory, formats, name_job, filename, user): - # directory = 'inputs' + # directory = 'inputs' or 'temp' # formats = [".rst", ".md"] input_files = None recursive = True @@ -21,21 +28,28 @@ def ingest_worker(self, directory, formats, name_job, filename, user): # name_job = 'job1' # filename = 'install.rst' # user = 'local' + full_path = directory + '/' + user + '/' + name_job url = 'http://localhost:5001/api/download' file_data = {'name': name_job, 'file': filename, 'user': user} response = requests.get(url, params=file_data) file = response.content - # save in folder inputs - # create folder if not exists - if not os.path.exists(directory): - os.makedirs(directory) - with open(directory + '/' + filename, 'wb') as f: + + if not os.path.exists(full_path): + os.makedirs(full_path) + with open(full_path + '/' + filename, 'wb') as f: f.write(file) + #check if file is .zip and extract it + if filename.endswith('.zip'): + with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref: + zip_ref.extractall(full_path) + os.remove(full_path + '/' + filename) + + import time self.update_state(state='PROGRESS', meta={'current': 1}) - raw_docs = SimpleDirectoryReader(input_dir=directory, 
input_files=input_files, recursive=recursive, + raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive, required_exts=formats, num_files_limit=limit, exclude_hidden=exclude).load_data() raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] @@ -43,15 +57,21 @@ def ingest_worker(self, directory, formats, name_job, filename, user): # We do this due to the context limits of the LLMs. text_splitter = RecursiveCharacterTextSplitter() docs = text_splitter.split_documents(raw_docs) - call_openai_api(docs, directory, self) + call_openai_api(docs, full_path, self) self.update_state(state='PROGRESS', meta={'current': 100}) # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl # and send them to the server (provide user and name in form) url = 'http://localhost:5001/api/upload_index' file_data = {'name': name_job, 'user': user} - files = {'file_faiss': open(directory + '/index.faiss', 'rb'), - 'file_pkl': open(directory + '/index.pkl', 'rb')} + files = {'file_faiss': open(full_path + '/index.faiss', 'rb'), + 'file_pkl': open(full_path + '/index.pkl', 'rb')} response = requests.post(url, files=files, data=file_data) - print(response.text) + + #deletes remote + url = 'http://localhost:5001/api/delete_old?path=' + 'inputs/' + user + '/' + name_job + response = requests.get(url) + # delete local + shutil.rmtree(full_path) + return {'directory': directory, 'formats': formats, 'name_job': name_job, 'filename': filename, 'user': user} From cb96d90563ca29fb681226cd37e649af4bbf5ae6 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 14 Mar 2023 10:36:27 +0000 Subject: [PATCH 3/6] Update .gitignore --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 5af7552..2aec05d 100644 --- a/.gitignore +++ b/.gitignore @@ -162,3 +162,7 @@ frontend/*.sw? 
application/vectors/ **/inputs + +**/indexes + +**/temp \ No newline at end of file From bfb47da39875f3fb8e22318459b6e16c55ca49a7 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 14 Mar 2023 11:34:55 +0000 Subject: [PATCH 4/6] security things --- application/app.py | 14 +++++++------- docker-compose.yaml | 11 ++++++++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/application/app.py b/application/app.py index a136cc7..d31fada 100644 --- a/application/app.py +++ b/application/app.py @@ -307,10 +307,10 @@ def upload_file(): """Upload a file to get vectorized and indexed.""" if 'user' not in request.form: return {"status": 'no user'} - user = request.form['user'] + user = secure_filename(request.form['user']) if 'name' not in request.form: return {"status": 'no name'} - job_name = request.form['name'] + job_name = secure_filename(request.form['name']) # check if the post request has the file part if 'file' not in request.files: print('No file part') @@ -350,10 +350,10 @@ def upload_index_files(): """Upload two files(index.faiss, index.pkl) to the user's folder.""" if 'user' not in request.form: return {"status": 'no user'} - user = request.form['user'] + user = secure_filename(request.form['user']) if 'name' not in request.form: return {"status": 'no name'} - job_name = request.form['name'] + job_name = secure_filename(request.form['name']) if 'file_faiss' not in request.files: print('No file part') return {"status": 'no file'} @@ -389,9 +389,9 @@ def upload_index_files(): @app.route('/api/download', methods=['get']) def download_file(): - user = request.args.get('user') - job_name = request.args.get('name') - filename = request.args.get('file') + user = secure_filename(request.args.get('user')) + job_name = secure_filename(request.args.get('name')) + filename = secure_filename(request.args.get('file')) save_dir = os.path.join(app.config['UPLOAD_FOLDER'], user, job_name) return send_from_directory(save_dir, filename, as_attachment=True) diff --git a/docker-compose.yaml b/docker-compose.yaml index 2a96868..a30ec7a 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -14,6 +14,14 @@ services: build: ./application ports: - "5001:5001" + volumes: + - app_data_container:/app + depends_on: + - redis + - mongo + worker: + build: ./application + command: celery -A app.celery worker -l info depends_on: - redis - mongo @@ -33,4 +41,5 @@ services: volumes: - mongodb_data_container: \ No newline at end of file + mongodb_data_container: + app_data_container: \ No newline at end of file From c4464455a16f923efdbc7f568e35d2c5a3cc9c80 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 14 Mar 2023 14:29:36 +0000 Subject: [PATCH 5/6] cors + dependencies --- application/Dockerfile | 4 +++- application/app.py | 9 ++++++++- application/requirements.txt | 18 ++++++++++++++++++ application/worker.py | 25 ++++++++++++++++++++----- docker-compose.yaml | 18 ++++++++++++++++-- 5 files changed, 65 insertions(+), 9 deletions(-) diff --git a/application/Dockerfile b/application/Dockerfile index d48fc1b..c494bac 100644 --- a/application/Dockerfile +++ b/application/Dockerfile @@ -12,11 +12,13 @@ RUN pip install -r requirements.txt FROM python:3.10-slim-bullseye # Copy pre-built packages from builder stage COPY --from=builder /usr/local/lib/python3.10/site-packages/ /usr/local/lib/python3.10/site-packages/ +RUN pip install gunicorn==20.1.0 +RUN pip install celery==5.2.7 WORKDIR /app COPY . 
/app ENV FLASK_APP=app.py ENV FLASK_DEBUG=true -RUN pip install gunicorn==20.1.0 + EXPOSE 5001 diff --git a/application/app.py b/application/app.py index d31fada..195ef4e 100644 --- a/application/app.py +++ b/application/app.py @@ -400,10 +400,16 @@ def delete_old(): """Delete old indexes.""" import shutil path = request.args.get('path') + dirs = path.split('/') first_dir = path.split('/')[0] + for i in range(1, len(dirs)): + dirs[i] = secure_filename(dirs[i]) + + # check that path strats with indexes or vectors - if first_dir not in ['indexes', 'vectors']: + if dirs[0] not in ['indexes', 'vectors']: return {"status": 'error'} + path = '/'.join(dirs) shutil.rmtree(path) vectors_collection.delete_one({'location': path}) return {"status": 'ok'} @@ -414,6 +420,7 @@ def after_request(response): response.headers.add('Access-Control-Allow-Origin', '*') response.headers.add('Access-Control-Allow-Headers', 'Content-Type,Authorization') response.headers.add('Access-Control-Allow-Methods', 'GET,PUT,POST,DELETE,OPTIONS') + response.headers.add('Access-Control-Allow-Credentials', 'true') return response diff --git a/application/requirements.txt b/application/requirements.txt index 80e2892..f4f3539 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -3,18 +3,25 @@ aiohttp==3.8.4 aiohttp-retry==2.8.3 aiosignal==1.3.1 aleph-alpha-client==2.16.1 +amqp==5.1.1 async-timeout==4.0.2 attrs==22.2.0 +billiard==3.6.4.0 blobfile==2.0.1 boto3==1.26.84 botocore==1.29.84 cffi==1.15.1 charset-normalizer==3.1.0 click==8.1.3 +click-didyoumean==0.3.0 +click-plugins==1.1.1 +click-repl==0.2.0 cryptography==39.0.2 dataclasses-json==0.5.7 +decorator==5.1.1 deeplake==3.2.13 dill==0.3.6 +dnspython==2.3.0 ecdsa==0.18.0 entrypoints==0.4 faiss-cpu==1.7.3 @@ -29,6 +36,8 @@ idna==3.4 itsdangerous==2.1.2 Jinja2==3.1.2 jmespath==1.0.1 +joblib==1.2.0 +kombu==5.2.4 langchain==0.0.103 lxml==4.9.2 MarkupSafe==2.1.2 @@ -37,6 +46,7 @@ marshmallow-enum==1.5.1 multidict==6.0.4 multiprocess==0.70.14 mypy-extensions==1.0.0 +nltk==3.8.1 numcodecs==0.11.0 numpy==1.24.2 openai==0.27.0 @@ -45,18 +55,24 @@ pathos==0.3.0 Pillow==9.4.0 pox==0.3.2 ppft==1.7.6.6 +prompt-toolkit==3.0.38 +py==1.11.0 pyasn1==0.4.8 pycares==4.3.0 pycparser==2.21 pycryptodomex==3.17 pydantic==1.10.5 PyJWT==2.6.0 +pymongo==4.3.3 python-dateutil==2.8.2 python-dotenv==1.0.0 python-jose==3.3.0 +pytz==2022.7.1 PyYAML==6.0 +redis==4.5.1 regex==2022.10.31 requests==2.28.2 +retry==0.9.2 rsa==4.9 s3transfer==0.6.0 six==1.16.0 @@ -69,5 +85,7 @@ transformers==4.26.1 typing-inspect==0.8.0 typing_extensions==4.5.0 urllib3==1.26.14 +vine==5.0.0 +wcwidth==0.2.6 Werkzeug==2.2.3 yarl==1.8.2 diff --git a/application/worker.py b/application/worker.py index 5523131..1a538f7 100644 --- a/application/worker.py +++ b/application/worker.py @@ -8,11 +8,16 @@ from parser.schema.base import Document from parser.open_ai_func import call_openai_api from celery import current_task -nltk.download('punkt', quiet=True) -nltk.download('averaged_perceptron_tagger', quiet=True) + import string import zipfile import shutil + +try: + nltk.download('punkt', quiet=True) + nltk.download('averaged_perceptron_tagger', quiet=True) +except FileExistsError: + pass def generate_random_string(length): return ''.join([string.ascii_letters[i % 52] for i in range(length)]) @@ -29,7 +34,11 @@ def ingest_worker(self, directory, formats, name_job, filename, user): # filename = 'install.rst' # user = 'local' full_path = directory + '/' + user + '/' + name_job - url = 
'http://localhost:5001/api/download' + # check if API_URL env variable is set + if not os.environ.get('API_URL'): + url = 'http://localhost:5001/api/download' + else: + url = os.environ.get('API_URL') + '/api/download' file_data = {'name': name_job, 'file': filename, 'user': user} response = requests.get(url, params=file_data) file = response.content @@ -62,14 +71,20 @@ def ingest_worker(self, directory, formats, name_job, filename, user): # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl # and send them to the server (provide user and name in form) - url = 'http://localhost:5001/api/upload_index' + if not os.environ.get('API_URL'): + url = 'http://localhost:5001/api/upload_index' + else: + url = os.environ.get('API_URL') + '/api/upload_index' file_data = {'name': name_job, 'user': user} files = {'file_faiss': open(full_path + '/index.faiss', 'rb'), 'file_pkl': open(full_path + '/index.pkl', 'rb')} response = requests.post(url, files=files, data=file_data) #deletes remote - url = 'http://localhost:5001/api/delete_old?path=' + 'inputs/' + user + '/' + name_job + if not os.environ.get('API_URL'): + url = 'http://localhost:5001/api/delete_old?path=' + 'inputs/' + user + '/' + name_job + else: + url = os.environ.get('API_URL') + '/api/delete_old?path=' + 'inputs/' + user + '/' + name_job response = requests.get(url) # delete local shutil.rmtree(full_path) diff --git a/docker-compose.yaml b/docker-compose.yaml index a30ec7a..172b6ba 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -4,7 +4,7 @@ services: frontend: build: ./frontend environment: - - API_HOST=http://backend:5001 + - VITE_API_HOST=http://localhost:5001 ports: - "5173:5173" depends_on: @@ -12,6 +12,12 @@ services: backend: build: ./application + environment: + - API_KEY= + - EMBEDDINGS_KEY= + - CELERY_BROKER_URL=redis://redis:6379/0 + - CELERY_RESULT_BACKEND=redis://redis:6379/1 + - MONGO_URI=mongodb://mongo:27017/docsgpt ports: - "5001:5001" volumes: @@ -19,9 +25,17 @@ services: depends_on: - redis - mongo + worker: build: ./application - command: celery -A app.celery worker -l info + command: celery -A app.celery worker -l INFO + environment: + - API_KEY= + - EMBEDDINGS_KEY= + - CELERY_BROKER_URL=redis://redis:6379/0 + - CELERY_RESULT_BACKEND=redis://redis:6379/1 + - MONGO_URI=mongodb://mongo:27017/docsgpt + - API_URL=http://backend:5001 depends_on: - redis - mongo From 4f64738f9ea9f880ca85ef9c2ff29e5d1e7654d0 Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 14 Mar 2023 14:36:40 +0000 Subject: [PATCH 6/6] Update app.py --- application/app.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/application/app.py b/application/app.py index 195ef4e..36170d0 100644 --- a/application/app.py +++ b/application/app.py @@ -401,15 +401,13 @@ def delete_old(): import shutil path = request.args.get('path') dirs = path.split('/') - first_dir = path.split('/')[0] + dirs_clean = [] for i in range(1, len(dirs)): - dirs[i] = secure_filename(dirs[i]) - - + dirs_clean.append(secure_filename(dirs[i])) # check that path strats with indexes or vectors if dirs[0] not in ['indexes', 'vectors']: return {"status": 'error'} - path = '/'.join(dirs) + path_clean = '/'.join(dirs) shutil.rmtree(path) vectors_collection.delete_one({'location': path}) return {"status": 'ok'}
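The path handling in this last hunk amounts to a small reusable rule: split the client-supplied path, accept only the indexes and vectors roots, pass every remaining segment through secure_filename, refuse bare roots, and only then delete. A standalone sketch of that rule, with an illustrative helper name and nothing beyond werkzeug and the standard library assumed:

import os
import shutil

from werkzeug.utils import secure_filename

ALLOWED_ROOTS = ("indexes", "vectors")  # only these trees may ever be removed


def delete_index_tree(raw_path):
    """Sanitise a client-supplied path and delete it only if it stays under an allowed root."""
    parts = raw_path.split("/")
    if not parts or parts[0] not in ALLOWED_ROOTS:
        return False
    # secure_filename() neutralises '..' and separator tricks (it returns '' for '..')
    cleaned = [s for s in (secure_filename(p) for p in parts[1:]) if s]
    if not cleaned:  # never delete a bare 'indexes' or 'vectors' root
        return False
    target = os.path.join(parts[0], *cleaned)
    if not os.path.isdir(target):
        return False
    shutil.rmtree(target)
    return True

A route built on this helper would delete the directory and then remove the matching record with vectors_collection.delete_one({'location': target}), keyed on the same sanitised path.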
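One piece the series leaves implicit is how a client observes the PROGRESS metadata that ingest_worker reports through self.update_state. A hedged sketch of a polling endpoint follows; the /api/task_status route and the idea that the upload endpoint hands the Celery task id back to the client are assumptions, not part of these patches, while app and celery are the objects that the worker command 'celery -A app.celery worker' already targets.

from celery.result import AsyncResult
from flask import jsonify, request


# hypothetical route, not part of this patch series
@app.route('/api/task_status', methods=['GET'])
def task_status():
    task_id = request.args.get('task_id')  # assumed to be returned to the client at upload time
    task = AsyncResult(task_id, app=celery)
    meta = task.info if isinstance(task.info, dict) else {}  # PROGRESS and SUCCESS carry a dict
    return jsonify({'status': task.state, 'current': meta.get('current')})

A frontend could poll this every few seconds and treat a SUCCESS state, or current reaching 100, as completion.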