Merge pull request #964 from ManishMadan2882/main

Feature: Token count for vectors
Alex 2024-05-27 11:44:11 +01:00 committed by GitHub
commit f6c66f6ee4
6 changed files with 39 additions and 34 deletions
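
This change threads a per-source token count through the ingestion pipeline: the worker counts tokens with tiktoken, sends the count along with the index upload, the API stores it on the source record and returns it from combined_json, and the frontend shows it in a new "Token usage" column. For orientation, a minimal sketch of the counting step (the helper name is illustrative; the actual helpers are added to application/worker.py further down):

# Illustrative sketch of the counting step this PR introduces (not part of the diff).
import tiktoken

def count_tokens(text: str, encoding_name: str = "cl100k_base") -> int:
    # cl100k_base is the tiktoken encoding used throughout this PR.
    encoding = tiktoken.get_encoding(encoding_name)
    return len(encoding.encode(text))

print(count_tokens("DocsGPT stores a token count per ingested source."))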

application/api/internal/routes.py Normal file → Executable file

@@ -34,6 +34,7 @@ def upload_index_files():
    if "name" not in request.form:
        return {"status": "no name"}
    job_name = secure_filename(request.form["name"])
    tokens = secure_filename(request.form["tokens"])
    save_dir = os.path.join(current_dir, "indexes", user, job_name)
    if settings.VECTOR_STORE == "faiss":
        if "file_faiss" not in request.files:
@@ -64,6 +65,7 @@ def upload_index_files():
            "date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
            "model": settings.EMBEDDINGS_NAME,
            "type": "local",
            "tokens": tokens
        }
    )
    return {"status": "ok"}
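
Worth noting: unlike "name", the new "tokens" field is read with request.form["tokens"], so an upload that omits it (for example from a worker that predates this change) fails with a BadRequestKeyError / HTTP 400 in Flask. A more tolerant variant, purely as a sketch and not part of this diff:

from flask import request
from werkzeug.utils import secure_filename

# Sketch: fall back to an empty string when the caller sends no token count.
tokens = secure_filename(request.form.get("tokens", ""))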


@@ -253,6 +253,7 @@ def combined_json():
            "docLink": "default",
            "model": settings.EMBEDDINGS_NAME,
            "location": "remote",
            "tokens": ""
        }
    ]
    # structure: name, language, version, description, fullName, date, docLink
@@ -269,6 +270,7 @@ def combined_json():
                "docLink": index["location"],
                "model": settings.EMBEDDINGS_NAME,
                "location": "local",
                "tokens": index["tokens"] if ("tokens" in index.keys()) else ""
            }
        )
    if settings.VECTOR_STORE == "faiss":
@@ -290,6 +292,7 @@ def combined_json():
                "docLink": "duckduck_search",
                "model": settings.EMBEDDINGS_NAME,
                "location": "custom",
                "tokens": ""
            }
        )
    if "brave_search" in settings.RETRIEVERS_ENABLED:
@@ -304,6 +307,7 @@ def combined_json():
                "docLink": "brave_search",
                "model": settings.EMBEDDINGS_NAME,
                "location": "custom",
                "tokens": ""
            }
        )
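
The fallback index["tokens"] if ("tokens" in index.keys()) else "" guards against source records created before this feature, which have no tokens field. The same lookup can be written with dict.get; a small self-contained sketch with illustrative records:

# Sketch: equivalent fallback using dict.get.
old_record = {"name": "docs", "location": "local"}                    # pre-feature record
new_record = {"name": "api", "location": "local", "tokens": "51234"}  # illustrative values

print(old_record.get("tokens", ""))   # prints ""
print(new_record.get("tokens", ""))   # prints "51234"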

application/parser/open_ai_func.py Normal file → Executable file

@@ -1,6 +1,5 @@
import os
import tiktoken
from application.vectorstore.vector_creator import VectorCreator
from application.core.settings import settings
from retry import retry
@@ -11,14 +10,6 @@ from retry import retry
# from langchain_community.embeddings import CohereEmbeddings
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    # Function to convert string to tokens and estimate user cost.
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    total_price = (num_tokens / 1000) * 0.0004
    return num_tokens, total_price
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
    store.add_texts([i.page_content], metadatas=[i.metadata])
@@ -79,25 +70,3 @@ def call_openai_api(docs, folder_name, task_status):
        store.save_local(f"{folder_name}")
def get_user_permission(docs, folder_name):
    # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
    # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
    # docs_content = (" ".join(docs))
    docs_content = ""
    for doc in docs:
        docs_content += doc.page_content
    tokens, total_price = num_tokens_from_string(
        string=docs_content, encoding_name="cl100k_base"
    )
    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
    print(f"Number of Tokens = {format(tokens, ',d')}")
    print(f"Approx Cost = ${format(total_price, ',.2f')}")
    # Here we check for user permission before calling the API.
    user_input = input("Price Okay? (Y/N) \n").lower()
    if user_input == "y":
        call_openai_api(docs, folder_name)
    elif user_input == "":
        call_openai_api(docs, folder_name)
    else:
        print("The API was not called. No money was spent.")

application/worker.py Normal file → Executable file

@@ -2,6 +2,7 @@ import os
import shutil
import string
import zipfile
import tiktoken
from urllib.parse import urljoin
import requests
@@ -131,6 +132,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
    docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
    call_openai_api(docs, full_path, self)
    tokens = count_tokens_docs(docs)
    self.update_state(state="PROGRESS", meta={"current": 100})
    if sample:
@@ -139,7 +141,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
    # get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
    # and send them to the server (provide user and name in form)
    file_data = {"name": name_job, "user": user}
    file_data = {"name": name_job, "user": user, "tokens": tokens}
    if settings.VECTOR_STORE == "faiss":
        files = {
            "file_faiss": open(full_path + "/index.faiss", "rb"),
@@ -188,18 +190,19 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
        max_tokens=max_tokens,
        token_check=token_check,
    )
    # docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
    call_openai_api(docs, full_path, self)
    tokens = count_tokens_docs(docs)
    self.update_state(state="PROGRESS", meta={"current": 100})
    # Proceed with uploading and cleaning as in the original function
    file_data = {"name": name_job, "user": user}
    file_data = {"name": name_job, "user": user, "tokens": tokens}
    if settings.VECTOR_STORE == "faiss":
        files = {
            "file_faiss": open(full_path + "/index.faiss", "rb"),
            "file_pkl": open(full_path + "/index.pkl", "rb"),
        }
        requests.post(
            urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
        )
@@ -210,3 +213,25 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
    shutil.rmtree(full_path)
    return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
def count_tokens_docs(docs):
    # Here we convert the docs list to a string and calculate the number of tokens the string represents.
    # docs_content = (" ".join(docs))
    docs_content = ""
    for doc in docs:
        docs_content += doc.page_content
    tokens, total_price = num_tokens_from_string(
        string=docs_content, encoding_name="cl100k_base"
    )
    # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
    return tokens
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    # Function to convert string to tokens and estimate user cost.
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    total_price = (num_tokens / 1000) * 0.0004
    return num_tokens, total_price
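
For reference, a rough usage sketch of the two helpers above, with illustrative documents; the cost formula is the one shown here, $0.0004 per 1K tokens, so 1,000,000 tokens work out to (1,000,000 / 1000) * 0.0004 = $0.40.

import tiktoken

class Doc:
    # Illustrative stand-in for the LangChain-format documents the worker handles.
    def __init__(self, page_content: str):
        self.page_content = page_content

docs = [Doc("Hello world."), Doc("DocsGPT stores a token count per source.")]

# Same steps as count_tokens_docs / num_tokens_from_string above.
text = "".join(d.page_content for d in docs)
encoding = tiktoken.get_encoding("cl100k_base")
num_tokens = len(encoding.encode(text))
approx_cost = (num_tokens / 1000) * 0.0004  # $0.0004 per 1K tokens, as above

print(f"Number of Tokens = {num_tokens:,d}")
print(f"Approx Cost = ${approx_cost:,.4f}")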


@@ -13,6 +13,7 @@ export type Doc = {
  date: string;
  docLink: string;
  model: string;
  tokens?: string;
};
export type PromptProps = {


@@ -14,6 +14,7 @@ const Documents: React.FC<DocumentsProps> = ({
            <tr>
              <th className="border-r p-4 md:w-[244px]">Document Name</th>
              <th className="w-[244px] border-r px-4 py-2">Vector Date</th>
              <th className="w-[244px] border-r px-4 py-2">Token usage</th>
              <th className="w-[244px] border-r px-4 py-2">Type</th>
              <th className="px-4 py-2"></th>
            </tr>
@@ -28,6 +29,9 @@ const Documents: React.FC<DocumentsProps> = ({
                  <td className="border-r border-t px-4 py-2">
                    {document.date}
                  </td>
                  <td className="border-r border-t px-4 py-2">
                    {document.tokens ? document.tokens : ''}
                  </td>
                  <td className="border-r border-t px-4 py-2">
                    {document.location === 'remote'
                      ? 'Pre-loaded'