diff --git a/scripts/ingest.py b/scripts/ingest.py index 3082cf4..6286a37 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -1,3 +1,5 @@ +from collections import defaultdict +import os import sys import nltk import dotenv @@ -18,13 +20,16 @@ app = typer.Typer(add_completion=False) nltk.download('punkt', quiet=True) nltk.download('averaged_perceptron_tagger', quiet=True) + #Splits all files in specified folder to documents @app.command() -def ingest(directory: Optional[str] = typer.Option("inputs", - help="Path to the directory for index creation."), - files: Optional[List[str]] = typer.Option(None, - help="""File paths to use (Optional; overrides directory). - E.g. --files inputs/1.md --files inputs/2.md"""), +def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False), + dir: Optional[List[str]] = typer.Option(["inputs"], + help="""List of paths to directory for index creation. + E.g. --dir inputs --dir inputs2"""), + file: Optional[List[str]] = typer.Option(None, + help="""File paths to use (Optional; overrides dir). + E.g. --file inputs/1.md --file inputs/2.md"""), recursive: Optional[bool] = typer.Option(True, help="Whether to recursively search in subdirectories."), limit: Optional[int] = typer.Option(None, @@ -38,27 +43,40 @@ def ingest(directory: Optional[str] = typer.Option("inputs", Creates index from specified location or files. By default /inputs folder is used, .rst and .md are parsed. """ - raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive, - required_exts=formats, num_files_limit=limit, - exclude_hidden=exclude).load_data() - raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] - print(raw_docs) - # Here we split the documents, as needed, into smaller chunks. - # We do this due to the context limits of the LLMs. - text_splitter = RecursiveCharacterTextSplitter() - docs = text_splitter.split_documents(raw_docs) - # Here we check for command line arguments for bot calls. - # If no argument exists or the permission_bypass_flag argument is not '-y', - # user permission is requested to call the API. - if len(sys.argv) > 1: - permission_bypass_flag = sys.argv[1] - if permission_bypass_flag == '-y': - call_openai_api(docs) + def process_one_docs(directory, folder_name): + raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive, + required_exts=formats, num_files_limit=limit, + exclude_hidden=exclude).load_data() + raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + print(raw_docs) + # Here we split the documents, as needed, into smaller chunks. + # We do this due to the context limits of the LLMs. + text_splitter = RecursiveCharacterTextSplitter() + docs = text_splitter.split_documents(raw_docs) + + # Here we check for command line arguments for bot calls. + # If no argument exists or the yes is not True, then the + # user permission is requested to call the API. + if len(sys.argv) > 1: + if yes: + call_openai_api(docs, folder_name) + else: + get_user_permission(docs, folder_name) else: - get_user_permission(docs) - else: - get_user_permission(docs) + get_user_permission(docs, folder_name) + + folder_counts = defaultdict(int) + folder_names = [] + for dir_path in dir: + folder_name = os.path.basename(os.path.normpath(dir_path)) + folder_counts[folder_name] += 1 + if folder_counts[folder_name] > 1: + folder_name = f"{folder_name}_{folder_counts[folder_name]}" + folder_names.append(folder_name) + + for directory, folder_name in zip(dir, folder_names): + process_one_docs(directory, folder_name) if __name__ == "__main__": app() diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index 7009132..c396600 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -1,3 +1,4 @@ +import os import faiss import pickle import tiktoken @@ -12,8 +13,13 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int: total_price = ((num_tokens/1000) * 0.0004) return num_tokens, total_price -def call_openai_api(docs): +def call_openai_api(docs, folder_name): # Function to create a vector store from the documents and save it to disk. + + # create output folder if it doesn't exist + if not os.path.exists(f"outputs/{folder_name}"): + os.makedirs(f"outputs/{folder_name}") + from tqdm import tqdm docs_test = [docs[0]] # remove the first element from docs @@ -31,25 +37,23 @@ def call_openai_api(docs): print("Error on ", i) print("Saving progress") print(f"stopped at {c1} out of {len(docs)}") - faiss.write_index(store.index, "docs.index") + faiss.write_index(store.index, f"outputs/{folder_name}/docs.index") store_index_bak = store.index store.index = None - with open("faiss_store.pkl", "wb") as f: + with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f: pickle.dump(store, f) print("Sleeping for 60 seconds and trying again") time.sleep(60) - faiss.write_index(store_index_bak, "docs.index") store.index = store_index_bak store.add_texts([i.page_content], metadatas=[i.metadata]) c1 += 1 - - faiss.write_index(store.index, "docs.index") + faiss.write_index(store.index, f"outputs/{folder_name}/docs.index") store.index = None - with open("faiss_store.pkl", "wb") as f: + with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f: pickle.dump(store, f) -def get_user_permission(docs): +def get_user_permission(docs, folder_name): # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents. #docs_content = (" ".join(docs)) @@ -65,8 +69,8 @@ def get_user_permission(docs): #Here we check for user permission before calling the API. user_input = input("Price Okay? (Y/N) \n").lower() if user_input == "y": - call_openai_api(docs) + call_openai_api(docs, folder_name) elif user_input == "": - call_openai_api(docs) + call_openai_api(docs, folder_name) else: print("The API was not called. No money was spent.")