From 9228005a7e3de5ae188b5cf2b8764cdf7ba2684f Mon Sep 17 00:00:00 2001
From: Alex
Date: Sun, 12 Feb 2023 16:25:01 +0000
Subject: [PATCH] chunked embedding

---
 scripts/parser/open_ai_func.py | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py
index 500e488..00c57be 100644
--- a/scripts/parser/open_ai_func.py
+++ b/scripts/parser/open_ai_func.py
@@ -14,12 +14,30 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
 
 def call_openai_api(docs):
     # Function to create a vector store from the documents and save it to disk.
-    store = FAISS.from_documents(docs, OpenAIEmbeddings())
-    faiss.write_index(store.index, "docs.index")
-    store.index = None
+    from tqdm import tqdm
+    docs_test = [docs[0]]
+    # remove the first element from docs
+    docs.pop(0)
+    # cut first n docs if you want to restart
+    #docs = docs[:n]
+    c1 = 0
+    store = FAISS.from_documents(docs_test, OpenAIEmbeddings())
+    for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
+        try:
+            import time
+            store.add_texts([i.page_content], metadatas=[i.metadata])
+        except Exception as e:
+            print(e)
+            print("Error on ", i)
+            print("Saving progress")
+            print(f"stopped at {c1} out of {len(docs)}")
+            store.save_local("outputs")
+            print("Sleeping for 10 seconds and trying again")
+            time.sleep(10)
+            store.add_texts([i.page_content], metadatas=[i.metadata])
+        c1 += 1
 
-    with open("faiss_store.pkl", "wb") as f:
-        pickle.dump(store, f)
+    store.save_local("outputs")
 
 def get_user_permission(docs):
     # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
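
Not part of the patch above: a minimal usage sketch of how the index persisted by store.save_local("outputs") could be loaded back with LangChain's FAISS wrapper, assuming the same OpenAIEmbeddings configuration and the default index name; the query string and result handling are only illustrative.

# Usage sketch (assumption, not part of this patch): reload the persisted store.
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# "outputs" is the folder written by store.save_local("outputs") in the patch.
store = FAISS.load_local("outputs", OpenAIEmbeddings())

# Example query against the restored index; k caps the number of returned chunks.
results = store.similarity_search("How do I configure the parser?", k=4)
for doc in results:
    print(doc.metadata, doc.page_content[:80])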