feat: add support for directory list

Example: `python ingest.py --dir inputs1 --dir another --dir ../inputs`. The output for each input directory is written to `outputs/<input_folder_name>/`.
2 years ago · b83589a308
parent 5883ce2685
commit b83589a308
2 changed files with 56 additions and 34 deletions
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@ -1,3 +1,5 @@
 from collections import defaultdict
 import os
 import sys
 import nltk
 import dotenv
@ -18,13 +20,16 @@ app = typer.Typer(add_completion=False)
 nltk.download('punkt', quiet=True)
 nltk.download('averaged_perceptron_tagger', quiet=True)
 #Splits all files in specified folder to documents
@app.command()
-def ingest(directory: Optional[str] = typer.Option("inputs",
+def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False),
-                                                   help="Path to the directory for index creation."),
+           dir: Optional[List[str]] = typer.Option(["inputs"],
-           files: Optional[List[str]] = typer.Option(None,
+                                                   help="""List of paths to directory for index creation.
-                                                   help="""File paths to use (Optional; overrides directory).
+                                                        E.g. --dir inputs --dir inputs2"""),
-                                                        E.g. --files inputs/1.md --files inputs/2.md"""),
+           file: Optional[List[str]] = typer.Option(None,
                                                   help="""File paths to use (Optional; overrides dir).
                                                        E.g. --file inputs/1.md --file inputs/2.md"""),
           recursive: Optional[bool] = typer.Option(True,
                                                   help="Whether to recursively search in subdirectories."),
           limit: Optional[int] = typer.Option(None,
@ -38,27 +43,40 @@ def ingest(directory: Optional[str] = typer.Option("inputs",
        Creates index from specified location or files.
        By default /inputs folder is used, .rst and .md are parsed.
    """
    raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive,
                                     required_exts=formats, num_files_limit=limit,
                                     exclude_hidden=exclude).load_data()
    raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
    print(raw_docs)
    # Here we split the documents, as needed, into smaller chunks.
    # We do this due to the context limits of the LLMs.
    text_splitter = RecursiveCharacterTextSplitter()
    docs = text_splitter.split_documents(raw_docs)
-    # Here we check for command line arguments for bot calls.
+    def process_one_docs(directory, folder_name):
-    # If no argument exists or the permission_bypass_flag argument is not '-y',
+        raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
-    # user permission is requested to call the API.
+                                         required_exts=formats, num_files_limit=limit,
-    if len(sys.argv) > 1:
+                                         exclude_hidden=exclude).load_data()
-        permission_bypass_flag = sys.argv[1]
+        raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
-        if permission_bypass_flag == '-y':
+        print(raw_docs)
-            call_openai_api(docs)
+        # Here we split the documents, as needed, into smaller chunks.
        # We do this due to the context limits of the LLMs.
        text_splitter = RecursiveCharacterTextSplitter()
        docs = text_splitter.split_documents(raw_docs)
        # Here we check for command line arguments for bot calls.
        # If no argument exists or the yes is not True, then the
        # user permission is requested to call the API.
        if len(sys.argv) > 1:
            if yes:
                call_openai_api(docs, folder_name)
            else:
                get_user_permission(docs, folder_name)
        else:
-            get_user_permission(docs)
+            get_user_permission(docs, folder_name)
-    else:
+
-        get_user_permission(docs)
+    folder_counts = defaultdict(int)
    folder_names = []
    for dir_path in dir:
        folder_name = os.path.basename(os.path.normpath(dir_path))
        folder_counts[folder_name] += 1
        if folder_counts[folder_name] > 1:
            folder_name = f"{folder_name}_{folder_counts[folder_name]}"
        folder_names.append(folder_name)
    for directory, folder_name in zip(dir, folder_names):
        process_one_docs(directory, folder_name)
 if __name__ == "__main__":
  app()
--- a/scripts/parser/open_ai_func.py
+++ b/scripts/parser/open_ai_func.py
@ -1,3 +1,4 @@
 import os
 import faiss
 import pickle
 import tiktoken
@ -12,8 +13,13 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
    total_price = ((num_tokens/1000) * 0.0004)
    return num_tokens, total_price
-def call_openai_api(docs):
+def call_openai_api(docs, folder_name):
 # Function to create a vector store from the documents and save it to disk.
    # create output folder if it doesn't exist
    if not os.path.exists(f"outputs/{folder_name}"):
        os.makedirs(f"outputs/{folder_name}")
    from tqdm import tqdm
    docs_test = [docs[0]]
    # remove the first element from docs
@ -31,25 +37,23 @@ def call_openai_api(docs):
            print("Error on ", i)
            print("Saving progress")
            print(f"stopped at {c1} out of {len(docs)}")
-            faiss.write_index(store.index, "docs.index")
+            faiss.write_index(store.index, f"outputs/{folder_name}/docs.index")
            store_index_bak = store.index
            store.index = None
-            with open("faiss_store.pkl", "wb") as f:
+            with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f:
                pickle.dump(store, f)
            print("Sleeping for 60 seconds and trying again")
            time.sleep(60)
            faiss.write_index(store_index_bak, "docs.index")
            store.index = store_index_bak
            store.add_texts([i.page_content], metadatas=[i.metadata])
        c1 += 1
-
+    faiss.write_index(store.index, f"outputs/{folder_name}/docs.index")
    faiss.write_index(store.index, "docs.index")
    store.index = None
-    with open("faiss_store.pkl", "wb") as f:
+    with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f:
        pickle.dump(store, f)
-def get_user_permission(docs):
+def get_user_permission(docs, folder_name):
 # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
    #docs_content = (" ".join(docs))
@ -65,8 +69,8 @@ def get_user_permission(docs):
    #Here we check for user permission before calling the API.
    user_input = input("Price Okay? (Y/N) \n").lower()
    if user_input == "y":
-        call_openai_api(docs)
+        call_openai_api(docs, folder_name)
    elif user_input == "":
-        call_openai_api(docs)
+        call_openai_api(docs, folder_name)
    else:
        print("The API was not called. No money was spent.")