Update text splitter to use recursive character text splitter

- Update create_vector_store to use RecursiveCharacterTextSplitter instead of CharacterTextSplitter for splitting documents.
- This change was made to improve document splitting and to increase the number of documents available for processing; a brief sketch of the effect follows below.
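A minimal sketch (not part of this commit) of why the swap produces more chunks, assuming the langchain version the project already imports; the sample Document content is invented for illustration:

from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Toy corpus: one large document built from a repeated snippet.
docs = [Document(page_content="def foo():\n    pass\n\n" * 200, metadata={"source": "example.py"})]

# Old behaviour: splits only on "\n\n" and merges pieces up to the 4000-character
# default chunk size, so a large file collapses into a handful of chunks.
old_chunks = CharacterTextSplitter().split_documents(docs)

# New behaviour: recursively falls back through "\n\n", "\n", " ", "" until every
# chunk fits in 500 characters, keeping a 50-character overlap between neighbours.
new_chunks = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50).split_documents(docs)

print(len(old_chunks), len(new_chunks))  # the recursive splitter yields many more, smaller chunks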
Saryev Rustam 1 year ago
parent df0fc45330
commit 9b9a834941

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "talk-codebase"
-version = "0.1.21"
+version = "0.1.22"
 description = "talk-codebase is a powerful tool for querying and analyzing codebases."
 authors = ["Saryev Rustam <rustam1997@gmail.com>"]
 readme = "README.md"

@@ -8,7 +8,7 @@ from langchain.callbacks.manager import CallbackManager
 from langchain.chains import ConversationalRetrievalChain
 from langchain.chat_models import ChatOpenAI
 from langchain.embeddings import OpenAIEmbeddings
-from langchain.text_splitter import CharacterTextSplitter
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 from talk_codebase.utils import StreamStdOut, load_files
@@ -47,7 +47,7 @@ def create_vector_store(root_dir, openai_api_key, model_name):
     if len(docs) == 0:
         print("✘ No documents found")
         exit(0)
-    text_splitter = CharacterTextSplitter()
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
     texts = text_splitter.split_documents(docs)
     cost = calculate_cost(docs, model_name)
@@ -74,7 +74,7 @@ def send_question(question, vector_store, openai_api_key, model_name):
     model = ChatOpenAI(model_name=model_name, openai_api_key=openai_api_key, streaming=True,
                        callback_manager=CallbackManager([StreamStdOut()]))
     qa = ConversationalRetrievalChain.from_llm(model,
-                                               retriever=vector_store.as_retriever(search_kwargs={"k": 2}),
+                                               retriever=vector_store.as_retriever(search_kwargs={"k": 4}),
                                                return_source_documents=True)
     answer = qa({"question": question, "chat_history": []})
     print('\n' + '\n'.join([f'📄 {os.path.abspath(s.metadata["source"])}:' for s in answer["source_documents"]]))
