feat: add cost calculation and approval prompt for vector store creation

- Add `calculate_cost` function to calculate the cost of creating a vector store for the given documents
- Prompt user to approve the cost before creating the vector store
- Exit if user does not approve the cost
pull/1/head
Saryev Rustam 1 year ago
parent 11185c079f
commit d1b0c0a796

44
poetry.lock generated

@ -844,6 +844,20 @@ files = [
{file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"},
]
[[package]]
name = "prompt-toolkit"
version = "3.0.38"
description = "Library for building powerful interactive command lines in Python"
optional = false
python-versions = ">=3.7.0"
files = [
{file = "prompt_toolkit-3.0.38-py3-none-any.whl", hash = "sha256:45ea77a2f7c60418850331366c81cf6b5b9cf4c7fd34616f733c5427e6abbb1f"},
{file = "prompt_toolkit-3.0.38.tar.gz", hash = "sha256:23ac5d50538a9a38c8bde05fecb47d0b403ecd0662857a86f886f798563d5b9b"},
]
[package.dependencies]
wcwidth = "*"
[[package]]
name = "pydantic"
version = "1.10.8"
@ -945,6 +959,23 @@ files = [
{file = "PyYAML-6.0.tar.gz", hash = "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2"},
]
[[package]]
name = "questionary"
version = "1.10.0"
description = "Python library to build pretty command line user prompts ⭐️"
optional = false
python-versions = ">=3.6,<4.0"
files = [
{file = "questionary-1.10.0-py3-none-any.whl", hash = "sha256:fecfcc8cca110fda9d561cb83f1e97ecbb93c613ff857f655818839dac74ce90"},
{file = "questionary-1.10.0.tar.gz", hash = "sha256:600d3aefecce26d48d97eee936fdb66e4bc27f934c3ab6dd1e292c4f43946d90"},
]
[package.dependencies]
prompt_toolkit = ">=2.0,<4.0"
[package.extras]
docs = ["Sphinx (>=3.3,<4.0)", "sphinx-autobuild (>=2020.9.1,<2021.0.0)", "sphinx-autodoc-typehints (>=1.11.1,<2.0.0)", "sphinx-copybutton (>=0.3.1,<0.4.0)", "sphinx-rtd-theme (>=0.5.0,<0.6.0)"]
[[package]]
name = "regex"
version = "2023.5.5"
@ -1308,6 +1339,17 @@ brotli = ["brotlipy (>=0.6.0)"]
secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)"]
socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[[package]]
name = "wcwidth"
version = "0.2.6"
description = "Measures the displayed width of unicode strings in a terminal"
optional = false
python-versions = "*"
files = [
{file = "wcwidth-0.2.6-py2.py3-none-any.whl", hash = "sha256:795b138f6875577cd91bba52baf9e445cd5118fd32723b460e30a0af30ea230e"},
{file = "wcwidth-0.2.6.tar.gz", hash = "sha256:a5220780a404dbe3353789870978e472cfe477761f06ee55077256e509b156d0"},
]
[[package]]
name = "yarl"
version = "1.9.2"
@ -1398,4 +1440,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "736cb46155cdaea6c9ea394d0a7e1da5c3c42e9db7dcd60cee73ba489ed81eec"
content-hash = "c5232c44ce381016066b186b5359b7f3690b75428daec3c8c99fa2394a069d78"

@ -1,6 +1,6 @@
[tool.poetry]
name = "talk-codebase"
version = "0.1.15"
version = "0.1.16"
description = "talk-codebase is a powerful tool for querying and analyzing codebases."
authors = ["Saryev Rustam <rustam1997@gmail.com>"]
readme = "README.md"
@ -17,6 +17,7 @@ faiss-cpu = "^1.7.4"
halo = "^0.0.31"
urllib3 = "1.26.6"
gitpython = "^3.1.31"
questionary = "^1.10.0"
[build-system]

@ -5,6 +5,7 @@ attrs==23.1.0
certifi==2023.5.7
charset-normalizer==3.1.0
colorama==0.4.6
colored==1.4.4
dataclasses-json==0.5.7
faiss-cpu==1.7.4
fire==0.5.0
@ -24,8 +25,10 @@ numpy==1.24.3
openai==0.27.7
openapi-schema-pydantic==1.2.4
packaging==23.1
prompt-toolkit==3.0.38
pydantic==1.10.8
PyYAML==6.0
questionary==1.10.0
regex==2023.5.5
requests==2.31.0
six==1.16.0
@ -39,4 +42,5 @@ tqdm==4.65.0
typing-inspect==0.9.0
typing_extensions==4.6.2
urllib3==1.26.6
wcwidth==0.2.6
yarl==1.9.2

@ -1,6 +1,7 @@
import os
from halo import Halo
import questionary
import tiktoken
from langchain import FAISS
from langchain.callbacks.manager import CallbackManager
from langchain.chains import ConversationalRetrievalChain
@ -11,7 +12,16 @@ from langchain.text_splitter import CharacterTextSplitter
from talk_codebase.utils import StreamStdOut, load_files
@Halo(text='Creating vector store', spinner='dots')
def calculate_cost(texts, rate_per_thousand_tokens=0.0004, encoding_name="cl100k_base"):
    """Estimate the cost of tokenizing (and thus embedding) the given documents.

    Args:
        texts: Iterable of objects exposing a string ``page_content`` attribute.
        rate_per_thousand_tokens: Price charged per 1000 tokens. The default
            of 0.0004 is the value previously hard-coded here; presumably it
            is the price of the embedding model used by the caller -- confirm
            against current pricing.
        encoding_name: Name of the tiktoken encoding used to count tokens.

    Returns:
        The estimated cost as a float: ``token_count / 1000 * rate``.
        Returns 0 for an empty ``texts``.
    """
    enc = tiktoken.get_encoding(encoding_name)
    # Count tokens per document instead of joining everything into one huge
    # string: this avoids a second in-memory copy of the whole corpus and
    # prevents tokens from merging across document boundaries.
    # disallowed_special=() makes text such as "<|endoftext|>" inside a source
    # file count as ordinary text; with the default setting tiktoken raises
    # ValueError when it encounters special-token text.
    token_count = sum(
        len(enc.encode(text.page_content, disallowed_special=()))
        for text in texts
    )
    return (token_count / 1000) * rate_per_thousand_tokens
def create_vector_store(root_dir, openai_api_key):
docs = load_files(root_dir)
if len(docs) == 0:
@ -20,6 +30,18 @@ def create_vector_store(root_dir, openai_api_key):
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)
cost = calculate_cost(docs)
approve = questionary.select(
f"Creating a vector store for {len(docs)} documents will cost ~${cost:.5f}. Do you want to continue?",
choices=[
{"name": "Yes", "value": True},
{"name": "No", "value": False},
]
).ask()
if not approve:
exit(0)
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
db = FAISS.from_documents(texts, embeddings)

Loading…
Cancel
Save