diff --git a/application/requirements.txt b/application/requirements.txt
index 9e8f73b..7972f8c 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -60,6 +60,7 @@ tiktoken==0.1.2
 tokenizers==0.13.2
 tqdm==4.64.1
 transformers==4.26.0
+typer==0.7.0
 typing-inspect==0.8.0
 typing_extensions==4.4.0
 urllib3==1.26.14
diff --git a/scripts/ingest.py b/scripts/ingest.py
index cebb6c3..3082cf4 100644
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@@ -1,6 +1,9 @@
 import sys
 import nltk
 import dotenv
+import typer
+
+from typing import List, Optional
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -10,28 +13,50 @@ from parser.open_ai_func import call_openai_api, get_user_permission
 
 dotenv.load_dotenv()
 
-#Specify your folder HERE
-directory_to_ingest = 'inputs'
+app = typer.Typer(add_completion=False)
 
-nltk.download('punkt')
-nltk.download('averaged_perceptron_tagger')
+nltk.download('punkt', quiet=True)
+nltk.download('averaged_perceptron_tagger', quiet=True)
 
 #Splits all files in specified folder to documents
-raw_docs = SimpleDirectoryReader(input_dir=directory_to_ingest).load_data()
-raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
-# Here we split the documents, as needed, into smaller chunks.
-# We do this due to the context limits of the LLMs.
-text_splitter = RecursiveCharacterTextSplitter()
-docs = text_splitter.split_documents(raw_docs)
-
-# Here we check for command line arguments for bot calls.
-# If no argument exists or the permission_bypass_flag argument is not '-y',
-# user permission is requested to call the API.
-if len(sys.argv) > 1:
-    permission_bypass_flag = sys.argv[1]
-    if permission_bypass_flag == '-y':
-        call_openai_api(docs)
+@app.command()
+def ingest(directory: Optional[str] = typer.Option("inputs",
+                                                   help="Path to the directory for index creation."),
+           files: Optional[List[str]] = typer.Option(None,
+                                                     help="""File paths to use (Optional; overrides directory).
+                                                     E.g. --files inputs/1.md --files inputs/2.md"""),
+           recursive: Optional[bool] = typer.Option(True,
+                                                    help="Whether to recursively search in subdirectories."),
+           limit: Optional[int] = typer.Option(None,
+                                               help="Maximum number of files to read."),
+           formats: Optional[List[str]] = typer.Option([".rst", ".md"],
+                                                       help="""List of required extensions (list with .)
+                                                       Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""),
+           exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
+           yes: Optional[bool] = typer.Option(False, "-y", "--yes",
+                                              help="Whether to skip the confirmation prompt before calling the API.")):
+
+    """
+    Creates index from specified location or files.
+    By default /inputs folder is used, .rst and .md are parsed.
+    """
+    raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive,
+                                     required_exts=formats, num_files_limit=limit,
+                                     exclude_hidden=exclude).load_data()
+    raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+    # Here we split the documents, as needed, into smaller chunks.
+    # We do this due to the context limits of the LLMs.
+    text_splitter = RecursiveCharacterTextSplitter()
+    docs = text_splitter.split_documents(raw_docs)
+
+    # -y/--yes is declared as a Typer option: Typer parses the command line
+    # before this function runs and rejects undeclared flags, so checking
+    # sys.argv[1] for '-y' could never reach the bypass branch.
+    # Without the flag, user permission is requested before calling the API.
+    if yes:
+        call_openai_api(docs)
     else:
         get_user_permission(docs)
-else:
-    get_user_permission(docs)
\ No newline at end of file
+
+if __name__ == "__main__":
+    app()
diff --git a/scripts/ingest_rst.py b/scripts/old/ingest_rst.py
similarity index 100%
rename from scripts/ingest_rst.py
rename to scripts/old/ingest_rst.py
diff --git a/scripts/ingest_rst_sphinx.py b/scripts/old/ingest_rst_sphinx.py
similarity index 100%
rename from scripts/ingest_rst_sphinx.py
rename to scripts/old/ingest_rst_sphinx.py