mirror of
https://github.com/arc53/DocsGPT
synced 2024-11-17 21:26:26 +00:00
Merge pull request #964 from ManishMadan2882/main
Feature: Token count for vectors
This commit is contained in:
commit
f6c66f6ee4
2
application/api/internal/routes.py
Normal file → Executable file
2
application/api/internal/routes.py
Normal file → Executable file
@ -34,6 +34,7 @@ def upload_index_files():
|
||||
if "name" not in request.form:
|
||||
return {"status": "no name"}
|
||||
job_name = secure_filename(request.form["name"])
|
||||
tokens = secure_filename(request.form["tokens"])
|
||||
save_dir = os.path.join(current_dir, "indexes", user, job_name)
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
if "file_faiss" not in request.files:
|
||||
@ -64,6 +65,7 @@ def upload_index_files():
|
||||
"date": datetime.datetime.now().strftime("%d/%m/%Y %H:%M:%S"),
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"type": "local",
|
||||
"tokens": tokens
|
||||
}
|
||||
)
|
||||
return {"status": "ok"}
|
@ -253,6 +253,7 @@ def combined_json():
|
||||
"docLink": "default",
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "remote",
|
||||
"tokens":""
|
||||
}
|
||||
]
|
||||
# structure: name, language, version, description, fullName, date, docLink
|
||||
@ -269,6 +270,7 @@ def combined_json():
|
||||
"docLink": index["location"],
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "local",
|
||||
"tokens" : index["tokens"] if ("tokens" in index.keys()) else ""
|
||||
}
|
||||
)
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
@ -290,6 +292,7 @@ def combined_json():
|
||||
"docLink": "duckduck_search",
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "custom",
|
||||
"tokens":""
|
||||
}
|
||||
)
|
||||
if "brave_search" in settings.RETRIEVERS_ENABLED:
|
||||
@ -304,6 +307,7 @@ def combined_json():
|
||||
"docLink": "brave_search",
|
||||
"model": settings.EMBEDDINGS_NAME,
|
||||
"location": "custom",
|
||||
"tokens":""
|
||||
}
|
||||
)
|
||||
|
||||
|
31
application/parser/open_ai_func.py
Normal file → Executable file
31
application/parser/open_ai_func.py
Normal file → Executable file
@ -1,6 +1,5 @@
|
||||
import os
|
||||
|
||||
import tiktoken
|
||||
from application.vectorstore.vector_creator import VectorCreator
|
||||
from application.core.settings import settings
|
||||
from retry import retry
|
||||
@ -11,14 +10,6 @@ from retry import retry
|
||||
# from langchain_community.embeddings import CohereEmbeddings
|
||||
|
||||
|
||||
def num_tokens_from_string(string: str, encoding_name: str) -> int:
|
||||
# Function to convert string to tokens and estimate user cost.
|
||||
encoding = tiktoken.get_encoding(encoding_name)
|
||||
num_tokens = len(encoding.encode(string))
|
||||
total_price = (num_tokens / 1000) * 0.0004
|
||||
return num_tokens, total_price
|
||||
|
||||
|
||||
@retry(tries=10, delay=60)
|
||||
def store_add_texts_with_retry(store, i):
|
||||
store.add_texts([i.page_content], metadatas=[i.metadata])
|
||||
@ -79,25 +70,3 @@ def call_openai_api(docs, folder_name, task_status):
|
||||
store.save_local(f"{folder_name}")
|
||||
|
||||
|
||||
def get_user_permission(docs, folder_name):
|
||||
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
|
||||
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
|
||||
# docs_content = (" ".join(docs))
|
||||
docs_content = ""
|
||||
for doc in docs:
|
||||
docs_content += doc.page_content
|
||||
|
||||
tokens, total_price = num_tokens_from_string(
|
||||
string=docs_content, encoding_name="cl100k_base"
|
||||
)
|
||||
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
|
||||
print(f"Number of Tokens = {format(tokens, ',d')}")
|
||||
print(f"Approx Cost = ${format(total_price, ',.2f')}")
|
||||
# Here we check for user permission before calling the API.
|
||||
user_input = input("Price Okay? (Y/N) \n").lower()
|
||||
if user_input == "y":
|
||||
call_openai_api(docs, folder_name)
|
||||
elif user_input == "":
|
||||
call_openai_api(docs, folder_name)
|
||||
else:
|
||||
print("The API was not called. No money was spent.")
|
||||
|
31
application/worker.py
Normal file → Executable file
31
application/worker.py
Normal file → Executable file
@ -2,6 +2,7 @@ import os
|
||||
import shutil
|
||||
import string
|
||||
import zipfile
|
||||
import tiktoken
|
||||
from urllib.parse import urljoin
|
||||
|
||||
import requests
|
||||
@ -131,6 +132,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
|
||||
|
||||
call_openai_api(docs, full_path, self)
|
||||
tokens = count_tokens_docs(docs)
|
||||
self.update_state(state="PROGRESS", meta={"current": 100})
|
||||
|
||||
if sample:
|
||||
@ -139,7 +141,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
|
||||
# get files from outputs/inputs/index.faiss and outputs/inputs/index.pkl
|
||||
# and send them to the server (provide user and name in form)
|
||||
file_data = {"name": name_job, "user": user}
|
||||
file_data = {"name": name_job, "user": user, "tokens":tokens}
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
files = {
|
||||
"file_faiss": open(full_path + "/index.faiss", "rb"),
|
||||
@ -188,18 +190,19 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
|
||||
max_tokens=max_tokens,
|
||||
token_check=token_check,
|
||||
)
|
||||
|
||||
# docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
|
||||
call_openai_api(docs, full_path, self)
|
||||
tokens = count_tokens_docs(docs)
|
||||
self.update_state(state="PROGRESS", meta={"current": 100})
|
||||
|
||||
# Proceed with uploading and cleaning as in the original function
|
||||
file_data = {"name": name_job, "user": user}
|
||||
file_data = {"name": name_job, "user": user, "tokens":tokens}
|
||||
if settings.VECTOR_STORE == "faiss":
|
||||
files = {
|
||||
"file_faiss": open(full_path + "/index.faiss", "rb"),
|
||||
"file_pkl": open(full_path + "/index.pkl", "rb"),
|
||||
}
|
||||
|
||||
requests.post(
|
||||
urljoin(settings.API_URL, "/api/upload_index"), files=files, data=file_data
|
||||
)
|
||||
@ -210,3 +213,25 @@ def remote_worker(self, source_data, name_job, user, loader, directory="temp"):
|
||||
shutil.rmtree(full_path)
|
||||
|
||||
return {"urls": source_data, "name_job": name_job, "user": user, "limited": False}
|
||||
|
||||
|
||||
def count_tokens_docs(docs):
|
||||
# Here we convert the docs list to a string and calculate the number of tokens the string represents.
|
||||
# docs_content = (" ".join(docs))
|
||||
docs_content = ""
|
||||
for doc in docs:
|
||||
docs_content += doc.page_content
|
||||
|
||||
tokens, total_price = num_tokens_from_string(
|
||||
string=docs_content, encoding_name="cl100k_base"
|
||||
)
|
||||
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
|
||||
return tokens
|
||||
|
||||
|
||||
def num_tokens_from_string(string: str, encoding_name: str) -> int:
|
||||
# Function to convert string to tokens and estimate user cost.
|
||||
encoding = tiktoken.get_encoding(encoding_name)
|
||||
num_tokens = len(encoding.encode(string))
|
||||
total_price = (num_tokens / 1000) * 0.0004
|
||||
return num_tokens, total_price
|
@ -13,6 +13,7 @@ export type Doc = {
|
||||
date: string;
|
||||
docLink: string;
|
||||
model: string;
|
||||
tokens?: string;
|
||||
};
|
||||
|
||||
export type PromptProps = {
|
||||
|
@ -14,6 +14,7 @@ const Documents: React.FC<DocumentsProps> = ({
|
||||
<tr>
|
||||
<th className="border-r p-4 md:w-[244px]">Document Name</th>
|
||||
<th className="w-[244px] border-r px-4 py-2">Vector Date</th>
|
||||
<th className="w-[244px] border-r px-4 py-2">Token usage</th>
|
||||
<th className="w-[244px] border-r px-4 py-2">Type</th>
|
||||
<th className="px-4 py-2"></th>
|
||||
</tr>
|
||||
@ -28,6 +29,9 @@ const Documents: React.FC<DocumentsProps> = ({
|
||||
<td className="border-r border-t px-4 py-2">
|
||||
{document.date}
|
||||
</td>
|
||||
<td className="border-r border-t px-4 py-2">
|
||||
{document.tokens ? document.tokens : ''}
|
||||
</td>
|
||||
<td className="border-r border-t px-4 py-2">
|
||||
{document.location === 'remote'
|
||||
? 'Pre-loaded'
|
||||
|
Loading…
Reference in New Issue
Block a user