talk-codebase/talk_codebase/llm.py

import os
import time
from typing import Optional

import gpt4all
import questionary
from halo import Halo
from langchain.vectorstores import FAISS
from langchain.callbacks.manager import CallbackManager
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.llms import LlamaCpp
from langchain.text_splitter import RecursiveCharacterTextSplitter

from talk_codebase.consts import MODEL_TYPES
from talk_codebase.utils import load_files, get_local_vector_store, calculate_cost, StreamStdOut


class BaseLLM:
    """Base class: build (or load) a FAISS vector store over the codebase and
    wire it to a chat model; subclasses supply the embeddings and the model."""

    def __init__(self, root_dir, config):
        self.config = config
        self.llm = self._create_model()
        self.root_dir = root_dir
        self.vector_store = self._create_store(root_dir)

    def _create_store(self, root_dir):
        raise NotImplementedError("Subclasses must implement this method.")

    def _create_model(self):
        raise NotImplementedError("Subclasses must implement this method.")

    def embedding_search(self, query, k):
        return self.vector_store.search(query, k=k, search_type="similarity")
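
    # Since `_create_vector_store` hands back a retriever rather than a raw
    # FAISS index, a similarity lookup can also go through the retriever
    # interface (a sketch against the langchain API of this vintage):
    #
    #     docs = self.vector_store.get_relevant_documents(query)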

    def _create_vector_store(self, embeddings, index, root_dir):
        k = int(self.config.get("k"))
        index_path = os.path.join(root_dir, f"vector_store/{index}")
        # Reuse an index that was saved to disk on a previous run, if any.
        new_db = get_local_vector_store(embeddings, index_path)
        if new_db is not None:
            return new_db.as_retriever(search_kwargs={"k": k})

        docs = load_files()
        if len(docs) == 0:
            print("✘ No documents found")
            exit(0)
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=int(self.config.get("chunk_size")),
                                                       chunk_overlap=int(self.config.get("chunk_overlap")))
        texts = text_splitter.split_documents(docs)
        if index == MODEL_TYPES["OPENAI"]:
            # OpenAI embeddings are billed per token, so estimate and confirm.
            cost = calculate_cost(docs, self.config.get("openai_model_name"))
            approve = questionary.select(
                f"Creating a vector store will cost ~${cost:.5f}. Do you want to continue?",
                choices=[
                    {"name": "Yes", "value": True},
                    {"name": "No", "value": False},
                ]
            ).ask()
            if not approve:
                exit(0)

        # Seed the index with the first chunk, then add the remaining chunks
        # one at a time so the spinner can show progress.
        spinners = Halo(text="Creating vector store", spinner='dots').start()
        db = FAISS.from_documents([texts[0]], embeddings)
        for i, text in enumerate(texts[1:]):
            spinners.text = f"Creating vector store ({i + 1}/{len(texts)})"
            db.add_documents([text])
        db.save_local(index_path)
        time.sleep(1.5)
        spinners.succeed("Created vector store")
        return db.as_retriever(search_kwargs={"k": k})
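
    # A one-call alternative to the loop above (a sketch using the same
    # `texts`, `embeddings`, and `index_path` as in `_create_vector_store`;
    # it trades the per-chunk progress display for simplicity):
    #
    #     db = FAISS.from_documents(texts, embeddings)
    #     db.save_local(index_path)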

    def send_query(self, query):
        retriever = self._create_store(self.root_dir)
        qa = RetrievalQA.from_chain_type(
            llm=self.llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True
        )
        docs = qa(query)
        # The answer itself streams to stdout through the StreamStdOut
        # callback on the model; only the source files are printed here.
        file_paths = [os.path.abspath(s.metadata["source"]) for s in docs['source_documents']]
        print('\n'.join([f'📄 {file_path}:' for file_path in file_paths]))
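
    # RetrievalQA returns a dict; besides `source_documents`, the final answer
    # is available as a plain string if a caller ever needs it (a sketch, not
    # something the code above uses):
    #
    #     answer = docs["result"]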


class LocalLLM(BaseLLM):

    def _create_store(self, root_dir: str) -> Optional[FAISS]:
        embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
        return self._create_vector_store(embeddings, MODEL_TYPES["LOCAL"], root_dir)

    def _create_model(self):
        # Download the model through gpt4all if it is not already cached,
        # then load it with the llama.cpp bindings.
        os.makedirs(self.config.get("model_path"), exist_ok=True)
        gpt4all.GPT4All.retrieve_model(model_name=self.config.get("local_model_name"),
                                       model_path=self.config.get("model_path"))
        model_path = os.path.join(self.config.get("model_path"), self.config.get("local_model_name"))
        model_n_ctx = int(self.config.get("max_tokens"))
        model_n_batch = int(self.config.get("n_batch"))
        callbacks = CallbackManager([StreamStdOut()])
        llm = LlamaCpp(model_path=model_path, n_ctx=model_n_ctx, n_batch=model_n_batch, callbacks=callbacks,
                       verbose=False)
        llm.client.verbose = False
        return llm


class OpenAILLM(BaseLLM):

    def _create_store(self, root_dir: str) -> Optional[FAISS]:
        embeddings = OpenAIEmbeddings(openai_api_key=self.config.get("api_key"))
        return self._create_vector_store(embeddings, MODEL_TYPES["OPENAI"], root_dir)

    def _create_model(self):
        return ChatOpenAI(model_name=self.config.get("openai_model_name"),
                          openai_api_key=self.config.get("api_key"),
                          streaming=True,
                          max_tokens=int(self.config.get("max_tokens")),
                          callback_manager=CallbackManager([StreamStdOut()]),
                          temperature=float(self.config.get("temperature")))
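
    # `streaming=True` together with the StreamStdOut callback makes ChatOpenAI
    # print tokens as they arrive rather than returning one final string.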


def factory_llm(root_dir, config):
    # Choose the backend from user config; anything other than "openai"
    # falls back to the local model.
    if config.get("model_type") == "openai":
        return OpenAILLM(root_dir, config)
    else:
        return LocalLLM(root_dir, config)
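

if __name__ == "__main__":
    # A minimal usage sketch, not part of the original module: the real CLI
    # assembles `config` from user settings. The keys below are inferred from
    # the .get() calls above, and the values are placeholder assumptions.
    demo_config = {
        "model_type": "openai",
        "api_key": os.environ.get("OPENAI_API_KEY"),
        "openai_model_name": "gpt-3.5-turbo",
        "max_tokens": "1024",
        "temperature": "0.7",
        "k": "4",
        "chunk_size": "500",
        "chunk_overlap": "50",
    }
    llm = factory_llm(os.getcwd(), demo_config)
    llm.send_query("What does this codebase do?")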