From af20c7298aa04a6b64e70347884795abb20a5ad6 Mon Sep 17 00:00:00 2001
From: Pavel
Date: Tue, 14 Feb 2023 19:37:07 +0400
Subject: [PATCH 1/2] new-ingest

Ingest with a CLI
---
 scripts/ingest.py                      | 63 ++++++++++++++++++--------
 scripts/{ => old}/ingest_rst.py        |  0
 scripts/{ => old}/ingest_rst_sphinx.py |  0
 3 files changed, 43 insertions(+), 20 deletions(-)
 rename scripts/{ => old}/ingest_rst.py (100%)
 rename scripts/{ => old}/ingest_rst_sphinx.py (100%)

diff --git a/scripts/ingest.py b/scripts/ingest.py
index cebb6c3..3082cf4 100644
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@@ -1,6 +1,9 @@
 import sys
 import nltk
 import dotenv
+import typer
+
+from typing import List, Optional
 
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
@@ -10,28 +13,48 @@ from parser.open_ai_func import call_openai_api, get_user_permission
 
 dotenv.load_dotenv()
 
-#Specify your folder HERE
-directory_to_ingest = 'inputs'
+app = typer.Typer(add_completion=False)
 
-nltk.download('punkt')
-nltk.download('averaged_perceptron_tagger')
+nltk.download('punkt', quiet=True)
+nltk.download('averaged_perceptron_tagger', quiet=True)
 
 #Splits all files in specified folder to documents
-raw_docs = SimpleDirectoryReader(input_dir=directory_to_ingest).load_data()
-raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
-# Here we split the documents, as needed, into smaller chunks.
-# We do this due to the context limits of the LLMs.
-text_splitter = RecursiveCharacterTextSplitter()
-docs = text_splitter.split_documents(raw_docs)
-
-# Here we check for command line arguments for bot calls.
-# If no argument exists or the permission_bypass_flag argument is not '-y',
-# user permission is requested to call the API.
-if len(sys.argv) > 1:
-    permission_bypass_flag = sys.argv[1]
-    if permission_bypass_flag == '-y':
-        call_openai_api(docs)
+@app.command()
+def ingest(directory: Optional[str] = typer.Option("inputs",
+                                                   help="Path to the directory for index creation."),
+           files: Optional[List[str]] = typer.Option(None,
+                                                     help="""File paths to use (Optional; overrides directory).
+                                                     E.g. --files inputs/1.md --files inputs/2.md"""),
+           recursive: Optional[bool] = typer.Option(True,
+                                                    help="Whether to recursively search in subdirectories."),
+           limit: Optional[int] = typer.Option(None,
+                                               help="Maximum number of files to read."),
+           formats: Optional[List[str]] = typer.Option([".rst", ".md"],
+                                                       help="""List of required extensions (list with .)
+                                                       Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""),
+           exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
+           yes: Optional[bool] = typer.Option(False, "-y", "--yes",
+                                              help="Call the OpenAI API without asking for permission first.")):
+    """
+    Creates index from specified location or files.
+    By default /inputs folder is used, .rst and .md are parsed.
+    """
+    raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive,
+                                     required_exts=formats, num_files_limit=limit,
+                                     exclude_hidden=exclude).load_data()
+    raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+    # Here we split the documents, as needed, into smaller chunks.
+    # We do this due to the context limits of the LLMs.
+    text_splitter = RecursiveCharacterTextSplitter()
+    docs = text_splitter.split_documents(raw_docs)
+
+    # Typer owns argv parsing, so the bypass is a declared -y/--yes option
+    # rather than a positional sys.argv check. Without it, the call is routed
+    # through get_user_permission().
+    if yes:
+        call_openai_api(docs)
     else:
         get_user_permission(docs)
-else:
-    get_user_permission(docs)
\ No newline at end of file
+
+if __name__ == "__main__":
+    app()
diff --git a/scripts/ingest_rst.py b/scripts/old/ingest_rst.py
similarity index 100%
rename from scripts/ingest_rst.py
rename to scripts/old/ingest_rst.py
diff --git a/scripts/ingest_rst_sphinx.py b/scripts/old/ingest_rst_sphinx.py
similarity index 100%
rename from scripts/ingest_rst_sphinx.py
rename to scripts/old/ingest_rst_sphinx.py

From 7af703451918234623c30d7bf62df5957397b49e Mon Sep 17 00:00:00 2001
From: Pavel
Date: Tue, 14 Feb 2023 19:41:37 +0400
Subject: [PATCH 2/2] requirements

---
 application/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/application/requirements.txt b/application/requirements.txt
index 9e8f73b..7972f8c 100644
--- a/application/requirements.txt
+++ b/application/requirements.txt
@@ -60,6 +60,7 @@ tiktoken==0.1.2
 tokenizers==0.13.2
 tqdm==4.64.1
 transformers==4.26.0
+typer==0.7.0
 typing-inspect==0.8.0
 typing_extensions==4.4.0
 urllib3==1.26.14