Merge pull request #964 from ManishMadan2882/main

Feature: Token count for vectors
Alex 2024-05-27 11:44:11 +01:00 committed by GitHub
commit f6c66f6ee4
6 changed files with 39 additions and 34 deletions
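
This change threads a per-source token count through the ingestion pipeline: the worker counts tokens with tiktoken, sends the count along with the index upload, the API stores it on the source record and returns it from combined_json, and the frontend shows it in a new "Token usage" column. For orientation, a minimal sketch of the counting step (the helper name is illustrative; the actual helpers are added to application/worker.py further down):

# Illustrative sketch of the counting step this PR introduces (not part of the diff).
import tiktoken

def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    # cl100k_base is the tiktoken encoding used throughout this PR.
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(text))

print(count_tokens("DocsGPT stores a token count per ingested source."))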

application/api/internal/routes.py Normal file → Executable file

@@ -34,6 +34,7 @@ def upload_index_files():
    if "name" not in request.form:
        return {"status": "no name"}
    job_name = secure_filename(request.form["name"])
    tokens = secure_filename(request.form["tokens"])
    save_dir = os.path.join(current_dir, "indexes", user, job_name)
    if settings.VECTOR_STORE == "faiss":
        if "file_faiss" not in request.files:
@@ -64,6 +65,7 @@ def upload_index_files():
            "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
            "model": settings.EMBEDDINGS_NAME,
            "type": "local",
            "tokens": tokens
        }
    )
    return {"status": "ok"}
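
Worth noting: unlike "name", the new "tokens" field is read with request.form["tokens"], so an upload that omits it (for example from a worker that predates this change) fails with a BadRequestKeyError / HTTP 400 in Flask. A more tolerant variant, purely as a sketch and not part of this diff:

from flask import request
from werkzeug.utils import secure_filename

# Sketch: fall back to an empty string when the caller sends no token count.
tokens = secure_filename(request.form.get("tokens", ""))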


@@ -253,6 +253,7 @@ def combined_json():
            "docLink": "default",
            "model": settings.EMBEDDINGS_NAME,
            "location": "remote",
            "tokens": ""
        }
    ]
    # structure: name, language, version, description, fullName, date, docLink
@@ -269,6 +270,7 @@ def combined_json():
                "docLink": index["location"],
                "model": settings.EMBEDDINGS_NAME,
                "location": "local",
                "tokens": index["tokens"] if ("tokens" in index.keys()) else ""
            }
        )
    if settings.VECTOR_STORE == "faiss":
@@ -290,6 +292,7 @@ def combined_json():
                "docLink": "duckduck_search",
                "model": settings.EMBEDDINGS_NAME,
                "location": "custom",
                "tokens": ""
            }
        )
    if "brave_search" in settings.RETRIEVERS_ENABLED:
@@ -304,6 +307,7 @@ def combined_json():
                "docLink": "brave_search",
                "model": settings.EMBEDDINGS_NAME,
                "location": "custom",
                "tokens": ""
            }
        )
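
The fallback index["tokens"] if ("tokens" in index.keys()) else "" guards against source records created before this feature, which have no tokens field. The same lookup can be written with dict.get; a small self-contained sketch with illustrative records:

# Sketch: equivalent fallback using dict.get.
old_record = {"name": "docs", "location": "local"}                    # pre-feature record
new_record = {"name": "api", "location": "local", "tokens": "51234"}  # illustrative values

print(old_record.get("tokens", ""))   # prints ""
print(new_record.get("tokens", ""))   # prints "51234"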

application/parser/open_ai_func.py Normal file → Executable file

@@ -1,6 +1,5 @@
import os
import tiktoken
from application.vectorstore.vector_creator import VectorCreator
from application.core.settings import settings
from retry import retry
@@ -11,14 +10,6 @@ from retry import retry
# from langchain_community.embeddings import CohereEmbeddings
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    # Function to convert string to tokens and estimate user cost.
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    total_price = (num_tokens / 1000) * 0.0004
    return num_tokens, total_price
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
    store.add_texts([i.page_content], metadatas=[i.metadata])
@@ -79,25 +70,3 @@ def call_openai_api(docs, folder_name, task_status):
        store.save_local(f"{folder_name}")
def get_user_permission(docs, folder_name):
    # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
    # docs_content = (" ".join(docs))
    docs_content = ""
    for doc in docs:
        docs_content += doc.page_content
    tokens, total_price = num_tokens_from_string(
        string=docs_content, encoding_name="cl100k_base"
    )
    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
    print(f"Number of Tokens = {format(tokens, ',d')}")
    print(f"Approx Cost = ${format(total_price, ',.2f')}")
    # Here we check for user permission before calling the API.
    user_input = input("Price Okay? (Y/N) \n").lower()
    if user_input == "y":
        call_openai_api(docs, folder_name)
    elif user_input == "":
        call_openai_api(docs, folder_name)
    else:
        print("The API was not called. No money was spent.")

application/worker.py Normal file → Executable file

@@ -2,6 +2,7 @@ import os
import shutil
import string
import zipfile
import tiktoken
from urllib.parse import urljoin
import requests
@@ -131,6 +132,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
    docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
    call_openai_api(docs, full_path, self)
    tokens = count_tokens_docs(docs)
    self.update_state(state="PROGRESS", meta={"current": 100})
    if sample:
@@ -139,7 +141,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
    # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
    # and send them to the server (provide user and name in form)
    file_data = {"name": name_job, "user": user}
    file_data = {"name": name_job, "user": user, "tokens": tokens}
    if settings.VECTOR_STORE == "faiss":
        files = {
            "file_faiss": open(full_path + "/index.faiss", "rb"),
@@ -188,18 +190,19 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
        max_tokens=max_tokens,
        token_check=token_check,
    )
    # docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
    call_openai_api(docs, full_path, self)
    tokens = count_tokens_docs(docs)
    self.update_state(state="PROGRESS", meta={"current": 100})
    # Proceed with uploading and cleaning as in the original function
    file_data = {"name": name_job, "user": user}
    file_data = {"name": name_job, "user": user, "tokens": tokens}
    if settings.VECTOR_STORE == "faiss":
        files = {
            "file_faiss": open(full_path + "/index.faiss", "rb"),
            "file_pkl": open(full_path + "/index.pkl", "rb"),
        }
        requests.post(
            urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
        )
@@ -210,3 +213,25 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
    shutil.rmtree(full_path)
    return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
def count_tokens_docs(docs):
    # Here we convert the docs list to a string and calculate the number of tokens the string represents.
    # docs_content = (" ".join(docs))
    docs_content = ""
    for doc in docs:
        docs_content += doc.page_content
    tokens, total_price = num_tokens_from_string(
        string=docs_content, encoding_name="cl100k_base"
    )
    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
    return tokens
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    # Function to convert string to tokens and estimate user cost.
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    total_price = (num_tokens / 1000) * 0.0004
    return num_tokens, total_price
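
For reference, a rough usage sketch of the two helpers above, with illustrative documents; the cost formula is the one shown here, $0.0004 per 1K tokens, so 1,000,000 tokens work out to (1,000,000 / 1000) * 0.0004 = $0.40.

import tiktoken

class Doc:
    # Illustrative stand-in for the LangChain-format documents the worker handles.
    def __init__(self, page_content: str):
        self.page_content = page_content

docs = [Doc("Hello world."), Doc("DocsGPT stores a token count per source.")]

# Same steps as count_tokens_docs / num_tokens_from_string above.
text = "".join(d.page_content for d in docs)
encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode(text))
approx_cost = (num_tokens / 1000) * 0.0004  # $0.0004 per 1K tokens, as above

print(f"Number of Tokens = {num_tokens:,d}")
print(f"Approx Cost = ${approx_cost:,.4f}")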


@@ -13,6 +13,7 @@ export type Doc = {
  date: string;
  docLink: string;
  model: string;
  tokens?: string;
};
export type PromptProps = {


@@ -14,6 +14,7 @@ const Documents: React.FC<DocumentsProps> = ({
            <tr>
              <th className="border-r p-4 md:w-[244px]">Document Name</th>
              <th className="w-[244px] border-r px-4 py-2">Vector Date</th>
              <th className="w-[244px] border-r px-4 py-2">Token usage</th>
              <th className="w-[244px] border-r px-4 py-2">Type</th>
              <th className="px-4 py-2"></th>
            </tr>
@@ -28,6 +29,9 @@ const Documents: React.FC<DocumentsProps> = ({
                  <td className="border-r border-t px-4 py-2">
                    {document.date}
                  </td>
                  <td className="border-r border-t px-4 py-2">
                    {document.tokens ? document.tokens : ''}
                  </td>
                  <td className="border-r border-t px-4 py-2">
                    {document.location === 'remote'
                      ? 'Pre-loaded'