feat: add support for directory list

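Example: `python ingest.py --dir inputs1 --dir another --dir ../inputs`. The outputs for each directory are written to `outputs/<input_folder_name>/`; if the same folder name occurs more than once, later occurrences get a numeric suffix (e.g. `outputs/inputs_2/`).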
2024-11-03 23:15:37 +00:00 · 2023-02-15 02:30:39 +08:00 · 2023-02-15 02:30:39 +08:00 · b83589a308
commit b83589a308
parent 5883ce2685
2 changed files with 56 additions and 34 deletions
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@ -1,3 +1,5 @@
+from collections import defaultdict
+import os
 import sys
 import nltk
 import dotenv
@ -18,13 +20,16 @@ app = typer.Typer(add_completion=False)
 nltk.download('punkt', quiet=True)
 nltk.download('averaged_perceptron_tagger', quiet=True)

+
 #Splits all files in specified folder to documents
@app.command()
-def ingest(directory: Optional[str] = typer.Option("inputs",
-                                                   help="Path to the directory for index creation."),
-           files: Optional[List[str]] = typer.Option(None,
-                                                   help="""File paths to use (Optional; overrides directory).
-                                                        E.g. --files inputs/1.md --files inputs/2.md"""),
+def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False),
+           dir: Optional[List[str]] = typer.Option(["inputs"],
+                                                   help="""List of paths to directory for index creation.
+                                                        E.g. --dir inputs --dir inputs2"""),
+           file: Optional[List[str]] = typer.Option(None,
+                                                   help="""File paths to use (Optional; overrides dir).
+                                                        E.g. --file inputs/1.md --file inputs/2.md"""),
           recursive: Optional[bool] = typer.Option(True,
                                                   help="Whether to recursively search in subdirectories."),
           limit: Optional[int] = typer.Option(None,
@ -38,27 +43,40 @@ def ingest(directory: Optional[str] = typer.Option("inputs",
        Creates index from specified location or files.
        By default /inputs folder is used, .rst and .md are parsed.
    """
-    raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive,
-                                     required_exts=formats, num_files_limit=limit,
-                                     exclude_hidden=exclude).load_data()
-    raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
-    print(raw_docs)
-    # Here we split the documents, as needed, into smaller chunks.
-    # We do this due to the context limits of the LLMs.
-    text_splitter = RecursiveCharacterTextSplitter()
-    docs = text_splitter.split_documents(raw_docs)

-    # Here we check for command line arguments for bot calls.
-    # If no argument exists or the permission_bypass_flag argument is not '-y',
-    # user permission is requested to call the API.
-    if len(sys.argv) > 1:
-        permission_bypass_flag = sys.argv[1]
-        if permission_bypass_flag == '-y':
-            call_openai_api(docs)
+    def process_one_docs(directory, folder_name):
+        raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
+                                         required_exts=formats, num_files_limit=limit,
+                                         exclude_hidden=exclude).load_data()
+        raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
+        print(raw_docs)
+        # Here we split the documents, as needed, into smaller chunks.
+        # We do this due to the context limits of the LLMs.
+        text_splitter = RecursiveCharacterTextSplitter()
+        docs = text_splitter.split_documents(raw_docs)
+
+        # Here we check for command line arguments for bot calls.
+        # If no argument exists or the yes is not True, then the
+        # user permission is requested to call the API.
+        if len(sys.argv) > 1:
+            if yes:
+                call_openai_api(docs, folder_name)
+            else:
+                get_user_permission(docs, folder_name)
        else:
-            get_user_permission(docs)
-    else:
-        get_user_permission(docs)
+            get_user_permission(docs, folder_name)
+
+    folder_counts = defaultdict(int)
+    folder_names = []
+    for dir_path in dir:
+        folder_name = os.path.basename(os.path.normpath(dir_path))
+        folder_counts[folder_name] += 1
+        if folder_counts[folder_name] > 1:
+            folder_name = f"{folder_name}_{folder_counts[folder_name]}"
+        folder_names.append(folder_name)
+
+    for directory, folder_name in zip(dir, folder_names):
+        process_one_docs(directory, folder_name)

 if __name__ == "__main__":
  app()
--- a/scripts/parser/open_ai_func.py
+++ b/scripts/parser/open_ai_func.py
@ -1,3 +1,4 @@
+import os
 import faiss
 import pickle
 import tiktoken
@ -12,8 +13,13 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
    total_price = ((num_tokens/1000) * 0.0004)
    return num_tokens, total_price

-def call_openai_api(docs):
+def call_openai_api(docs, folder_name):
 # Function to create a vector store from the documents and save it to disk.
+
+    # create output folder if it doesn't exist
+    if not os.path.exists(f"outputs/{folder_name}"):
+        os.makedirs(f"outputs/{folder_name}")
+
    from tqdm import tqdm
    docs_test = [docs[0]]
    # remove the first element from docs
@ -31,25 +37,23 @@ def call_openai_api(docs):
            print("Error on ", i)
            print("Saving progress")
            print(f"stopped at {c1} out of {len(docs)}")
-            faiss.write_index(store.index, "docs.index")
+            faiss.write_index(store.index, f"outputs/{folder_name}/docs.index")
            store_index_bak = store.index
            store.index = None
-            with open("faiss_store.pkl", "wb") as f:
+            with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f:
                pickle.dump(store, f)
            print("Sleeping for 60 seconds and trying again")
            time.sleep(60)
-            faiss.write_index(store_index_bak, "docs.index")
            store.index = store_index_bak
            store.add_texts([i.page_content], metadatas=[i.metadata])
        c1 += 1

-
-    faiss.write_index(store.index, "docs.index")
+    faiss.write_index(store.index, f"outputs/{folder_name}/docs.index")
    store.index = None
-    with open("faiss_store.pkl", "wb") as f:
+    with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f:
        pickle.dump(store, f)

-def get_user_permission(docs):
+def get_user_permission(docs, folder_name):
 # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
    #docs_content = (" ".join(docs))
@ -65,8 +69,8 @@ def get_user_permission(docs):
    #Here we check for user permission before calling the API.
    user_input = input("Price Okay? (Y/N) \n").lower()
    if user_input == "y":
-        call_openai_api(docs)
+        call_openai_api(docs, folder_name)
    elif user_input == "":
-        call_openai_api(docs)
+        call_openai_api(docs, folder_name)
    else:
        print("The API was not called. No money was spent.")