Merge pull request #1 from rsaryev/feat/local

GPT4All
Saryev Rustam 1 year ago committed by Saryev Rustam
commit 572ee1bf4e

.gitignore (vendored): 3 lines changed

@@ -2,4 +2,5 @@
/.idea/
/.vscode/
/.venv/
/talk_codebase/__pycache__/
/talk_codebase/__pycache__/
.DS_Store

@@ -1,4 +1,4 @@
# talk-codebase is a powerful tool for chatting with your codebase
## talk-codebase: a tool for chatting with your codebase (OpenAI, LlamaCpp, GPT4All)
[![Node.js Package](https://github.com/rsaryev/talk-codebase/actions/workflows/python-publish.yml/badge.svg)](https://github.com/rsaryev/talk-codebase/actions/workflows/python-publish.yml)
@@ -6,6 +6,16 @@
<img src="https://github.com/rsaryev/talk-codebase/assets/70219513/b5d338f9-14a5-417b-9690-83f5cd66facf" width="800" alt="chat">
</p>
## Description
Talk-codebase is a tool that lets you chat with your codebase and uses LLMs to answer your queries.
You can use [GPT4All](https://github.com/nomic-ai/gpt4all) to process your code offline, without sharing it with
third parties, or OpenAI if privacy is not a concern for you. Switching between the two options is quick and easy.
This project was created for educational purposes and is not recommended for production use.
## Installation
```bash
@@ -14,14 +24,14 @@ pip install talk-codebase
## Usage
talk-codebase works only with files of popular programming languages and additionally with .txt files. All other files
will be ignored.
Talk-codebase works only with files of popular programming languages and .txt files. All other files will be ignored.
If you want some files to be ignored, add them to .gitignore.
```bash
# Start chatting with your codebase
talk-codebase chat <directory>
# Configure
# Configure or edit the configuration stored in ~/.config.yaml
talk-codebase configure
# Help
@@ -31,4 +41,11 @@ talk-codebase --help
## Requirements
- Python 3.9
- OpenAI API key [api-keys](https://platform.openai.com/account/api-keys)
- OpenAI API key [api-keys](https://platform.openai.com/account/api-keys)
- If you want to use GPT4All, download the model [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) and specify the path to it in the configuration (see the sketch below).
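For reference, the snippet below is a minimal sketch (not part of the package) of what configuring the local backend amounts to: writing `model_type` and `model_path` into `~/.config.yaml`, which is what `talk-codebase configure` does interactively. The keys mirror `DEFAULT_CONFIG` in this commit's `consts.py`; the model path is only an example location.
```python
# Minimal sketch: point talk-codebase at a local GPT4All model by editing
# ~/.config.yaml the same way configure()/save_config() do. Assumes PyYAML
# is installed (it is a dependency of talk-codebase).
import os
import yaml

config_path = os.path.join(os.path.expanduser("~"), ".config.yaml")

config = {}
if os.path.exists(config_path):
    with open(config_path) as f:
        config = yaml.safe_load(f) or {}

config["model_type"] = "local"
# Example path only: use wherever you saved ggml-gpt4all-j-v1.3-groovy.bin.
config["model_path"] = "models/ggml-gpt4all-j-v1.3-groovy.bin"

with open(config_path, "w") as f:
    yaml.dump(config, f)
```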
## Contributing
Contributions are always welcome!

poetry.lock (generated): 796 lines changed

File diff suppressed because it is too large

@@ -1,6 +1,6 @@
[tool.poetry]
name = "talk-codebase"
version = "0.1.22"
version = "0.1.25"
description = "talk-codebase is a powerful tool for querying and analyzing codebases."
authors = ["Saryev Rustam <rustam1997@gmail.com>"]
readme = "README.md"
@@ -18,6 +18,8 @@ halo = "^0.0.31"
urllib3 = "1.26.6"
gitpython = "^3.1.31"
questionary = "^1.10.0"
gpt4all = "^0.2.3"
sentence-transformers = "^2.2.2"
[build-system]

@@ -4,41 +4,63 @@ async-timeout==4.0.2
attrs==23.1.0
certifi==2023.5.7
charset-normalizer==3.1.0
click==8.1.3
colorama==0.4.6
colored==1.4.4
dataclasses-json==0.5.7
faiss-cpu==1.7.4
filelock==3.12.0
fire==0.5.0
frozenlist==1.3.3
fsspec==2023.5.0
gitdb==4.0.10
GitPython==3.1.31
gpt4all==0.2.3
halo==0.0.31
huggingface-hub==0.14.1
idna==3.4
Jinja2==3.1.2
joblib==1.2.0
langchain==0.0.181
log-symbols==0.0.14
MarkupSafe==2.1.2
marshmallow==3.19.0
marshmallow-enum==1.5.1
mpmath==1.3.0
multidict==6.0.4
mypy-extensions==1.0.0
networkx==3.1
nltk==3.8.1
numexpr==2.8.4
numpy==1.24.3
openai==0.27.7
openapi-schema-pydantic==1.2.4
packaging==23.1
Pillow==9.5.0
prompt-toolkit==3.0.38
pydantic==1.10.8
PyYAML==6.0
questionary==1.10.0
regex==2023.5.5
requests==2.31.0
scikit-learn==1.2.2
scipy==1.9.3
sentence-transformers==2.2.2
sentencepiece==0.1.99
six==1.16.0
smmap==5.0.0
spinners==0.0.24
SQLAlchemy==2.0.15
sympy==1.12
tenacity==8.2.2
termcolor==2.3.0
threadpoolctl==3.1.0
tiktoken==0.4.0
tokenizers==0.13.3
torch==2.0.1
torchvision==0.15.2
tqdm==4.65.0
transformers==4.29.2
typing-inspect==0.9.0
typing_extensions==4.6.2
urllib3==1.26.6

@@ -0,0 +1,108 @@
import os
from typing import Optional
import questionary
from halo import Halo
from langchain import FAISS
from langchain.callbacks.manager import CallbackManager
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import HuggingFaceEmbeddings, OpenAIEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import RecursiveCharacterTextSplitter
from talk_codebase.consts import MODEL_TYPES
from talk_codebase.utils import load_files, get_local_vector_store, calculate_cost, StreamStdOut
class BaseLLM:
def __init__(self, root_dir, config):
self.config = config
self.llm = self._create_model()
self.root_dir = root_dir
self.vector_store = self._create_store(root_dir)
def _create_store(self, root_dir):
raise NotImplementedError("Subclasses must implement this method.")
def _create_model(self):
raise NotImplementedError("Subclasses must implement this method.")
def send_query(self, question):
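# RetrievalQA with chain_type="stuff" concatenates the k retrieved chunks into a
# single prompt for the model; k is read from the config (stored as a string, default "4").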
k = self.config.get("k")
qa = RetrievalQA.from_chain_type(llm=self.llm, chain_type="stuff",
retriever=self.vector_store.as_retriever(search_kwargs={"k": int(k)}),
return_source_documents=True)
answer = qa(question)
print('\n' + '\n'.join([f'📄 {os.path.abspath(s.metadata["source"])}:' for s in answer["source_documents"]]))
def _create_vector_store(self, embeddings, index, root_dir):
index_path = os.path.join(root_dir, f"vector_store/{index}")
new_db = get_local_vector_store(embeddings, index_path)
if new_db is not None:
approve = questionary.select(
f"Found existing vector store. Do you want to use it?",
choices=[
{"name": "Yes", "value": True},
{"name": "No", "value": False},
]
).ask()
if approve:
return new_db
docs = load_files(root_dir)
if len(docs) == 0:
print("✘ No documents found")
exit(0)
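# Split the documents into chunks of roughly chunk_size characters, with chunk_overlap
# characters shared between neighbouring chunks; both values are stored as strings in the
# config, hence the int() casts.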
text_splitter = RecursiveCharacterTextSplitter(chunk_size=int(self.config.get("chunk_size")),
chunk_overlap=int(self.config.get("chunk_overlap")))
texts = text_splitter.split_documents(docs)
if index == MODEL_TYPES["OPENAI"]:
cost = calculate_cost(docs, self.config.get("model_name"))
approve = questionary.select(
f"Creating a vector store for {len(docs)} documents will cost ~${cost:.5f}. Do you want to continue?",
choices=[
{"name": "Yes", "value": True},
{"name": "No", "value": False},
]
).ask()
if not approve:
exit(0)
spinners = Halo(text=f"Creating vector store for {len(docs)} documents", spinner='dots').start()
db = FAISS.from_documents(texts, embeddings)
db.save_local(index_path)
spinners.succeed(f"Created vector store for {len(docs)} documents")
return db
class LocalLLM(BaseLLM):
def _create_store(self, root_dir: str) -> Optional[FAISS]:
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')
return self._create_vector_store(embeddings, MODEL_TYPES["LOCAL"], root_dir)
def _create_model(self):
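# Load the local GPT4All model from the configured model_path; n_ctx caps the
# token context window using the configured max_tokens.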
llm = GPT4All(model=self.config.get("model_path"), n_ctx=int(self.config.get("max_tokens")), streaming=True)
return llm
class OpenAILLM(BaseLLM):
def _create_store(self, root_dir: str) -> Optional[FAISS]:
embeddings = OpenAIEmbeddings(openai_api_key=self.config.get("api_key"))
return self._create_vector_store(embeddings, MODEL_TYPES["OPENAI"], root_dir)
def _create_model(self):
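# Stream completions token-by-token to stdout via the StreamStdOut callback
# handler imported from talk_codebase.utils.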
return ChatOpenAI(model_name=self.config.get("model_name"), openai_api_key=self.config.get("api_key"),
streaming=True,
max_tokens=int(self.config.get("max_tokens")),
callback_manager=CallbackManager([StreamStdOut()]))
def factory_llm(root_dir, config):
if config.get("model_type") == "openai":
return OpenAILLM(root_dir, config)
else:
return LocalLLM(root_dir, config)
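For orientation, here is a rough usage sketch of the factory above, mirroring what `chat()` in the CLI does in this commit. The import path follows the `from talk_codebase.LLM import factory_llm` line used by the CLI; the directory and config overrides are illustrative assumptions layered on top of `DEFAULT_CONFIG`.
```python
# Illustrative only: build a backend from a config dict and ask one question.
from talk_codebase.consts import DEFAULT_CONFIG
from talk_codebase.LLM import factory_llm  # import path as used by the CLI in this commit

config = dict(DEFAULT_CONFIG)
config.update({
    "model_type": "local",                                  # or "openai" (then "api_key" is required)
    "model_path": "models/ggml-gpt4all-j-v1.3-groovy.bin",  # example path
})

llm = factory_llm("./my_project", config)  # builds or reuses the FAISS vector store for ./my_project
llm.send_query("Where is the configuration loaded from?")
```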

@@ -1,14 +1,17 @@
import os
import fire
import questionary
import yaml
from talk_codebase.llm import create_vector_store, send_question
from talk_codebase.LLM import factory_llm
from talk_codebase.consts import DEFAULT_CONFIG
def get_config():
home_dir = os.path.expanduser("~")
config_path = os.path.join(home_dir, ".config.yaml")
print(f"🤖 Loading config from {config_path}:")
if os.path.exists(config_path):
with open(config_path, "r") as f:
config = yaml.safe_load(f)
@@ -26,50 +29,74 @@ def save_config(config):
def configure():
config = get_config()
api_key = input("🤖 Enter your OpenAI API key: ")
model_name = input("🤖 Enter your model name (default: gpt-3.5-turbo): ") or "gpt-3.5-turbo"
config["api_key"] = api_key
config["model_name"] = model_name
model_type = questionary.select(
"🤖 Select model type:",
choices=[
{"name": "OpenAI", "value": "openai"},
{"name": "Local", "value": "local"},
]
).ask()
config["model_type"] = model_type
if model_type == "openai":
api_key = input("🤖 Enter your OpenAI API key: ")
model_name = input("🤖 Enter your model name (default: gpt-3.5-turbo): ")
config["model_name"] = model_name if model_name else DEFAULT_CONFIG["model_name"]
config["api_key"] = api_key
elif model_type == "local":
model_path = input(f"🤖 Enter your model path: (default: {DEFAULT_CONFIG['model_path']}) ")
config["model_path"] = model_path if model_path else DEFAULT_CONFIG["model_path"]
save_config(config)
print("🤖 Configuration saved!")
def loop(vector_store, api_key, model_name):
def loop(llm):
while True:
question = input("👉 ")
if not question:
print("🤖 Please enter a question.")
query = input("👉 ").lower().strip()
if not query:
print("🤖 Please enter a query")
continue
if question.lower() in ('exit', 'quit'):
if query in ('exit', 'quit'):
break
send_question(question, vector_store, api_key, model_name)
llm.send_query(query)
def validate_config(config):
for key, value in DEFAULT_CONFIG.items():
if key not in config:
config[key] = value
if config.get("model_type") == "openai":
api_key = config.get("api_key")
if not api_key:
print("🤖 Please configure your API key. Use talk-codebase configure --model_type=openai")
exit(0)
elif config.get("model_type") == "local":
model_path = config.get("model_path")
if not model_path:
print("🤖 Please configure your model path. Use talk-codebase configure --model_type=local")
exit(0)
save_config(config)
return config
def chat(root_dir):
config = validate_config(get_config())
llm = factory_llm(root_dir, config)
loop(llm)
def main():
try:
config = get_config()
api_key = config.get("api_key")
model_name = config.get("model_name")
if not (api_key and model_name):
configure()
chat(root_dir)
vector_store = create_vector_store(root_dir, api_key, model_name)
loop(vector_store, api_key, model_name)
fire.Fire({
"chat": chat,
"configure": configure
})
except KeyboardInterrupt:
print("\n🤖 Bye!")
except Exception as e:
if str(e) == "<empty message>":
print("🤖 Please configure your API key.")
configure()
chat(root_dir)
print("🤖 Please configure your API key. Use talk-codebase configure --model_type=openai")
else:
print(f"\n🤖 Error: {e}")
def main():
fire.Fire({
"chat": chat,
"configure": configure,
})
raise e
if __name__ == "__main__":

@@ -4,3 +4,16 @@ ALLOW_FILES = ['.txt', '.js', '.mjs', '.ts', '.tsx', '.css', '.scss', '.less', '
'.java', '.c', '.cpp', '.cs', '.go', '.php', '.rb', '.rs', '.swift', '.kt', '.scala', '.m', '.h',
'.sh', '.pl', '.pm', '.lua', '.sql']
EXCLUDE_FILES = ['requirements.txt', 'package.json', 'package-lock.json', 'yarn.lock']
MODEL_TYPES = {
"OPENAI": "openai",
"LOCAL": "local",
}
DEFAULT_CONFIG = {
"max_tokens": "1048",
"chunk_size": "500",
"chunk_overlap": "50",
"k": "4",
"model_name": "gpt-3.5-turbo",
"model_path": "models/ggml-gpt4all-j-v1.3-groovy.bin",
"model_type": MODEL_TYPES["OPENAI"],
}

@@ -1,81 +0,0 @@
import os
import questionary
import tiktoken
from halo import Halo
from langchain import FAISS
from langchain.callbacks.manager import CallbackManager
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from talk_codebase.utils import StreamStdOut, load_files
def calculate_cost(texts, model_name):
enc = tiktoken.encoding_for_model(model_name)
all_text = ''.join([text.page_content for text in texts])
tokens = enc.encode(all_text)
token_count = len(tokens)
cost = (token_count / 1000) * 0.0004
return cost
def get_local_vector_store(embeddings):
try:
return FAISS.load_local("vector_store", embeddings)
except:
return None
def create_vector_store(root_dir, openai_api_key, model_name):
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
new_db = get_local_vector_store(embeddings)
if new_db is not None:
approve = questionary.select(
f"Found existing vector store. Do you want to use it?",
choices=[
{"name": "Yes", "value": True},
{"name": "No", "value": False},
]
).ask()
if approve:
return new_db
docs = load_files(root_dir)
if len(docs) == 0:
print("✘ No documents found")
exit(0)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(docs)
cost = calculate_cost(docs, model_name)
approve = questionary.select(
f"Creating a vector store for {len(docs)} documents will cost ~${cost:.5f}. Do you want to continue?",
choices=[
{"name": "Yes", "value": True},
{"name": "No", "value": False},
]
).ask()
if not approve:
exit(0)
spinners = Halo(text='Creating vector store', spinner='dots').start()
db = FAISS.from_documents(texts, embeddings)
db.save_local("vector_store")
spinners.succeed(f"Created vector store with {len(docs)} documents")
return db
def send_question(question, vector_store, openai_api_key, model_name):
model = ChatOpenAI(model_name=model_name, openai_api_key=openai_api_key, streaming=True,
callback_manager=CallbackManager([StreamStdOut()]))
qa = ConversationalRetrievalChain.from_llm(model,
retriever=vector_store.as_retriever(search_kwargs={"k": 4}),
return_source_documents=True)
answer = qa({"question": question, "chat_history": []})
print('\n' + '\n'.join([f'📄 {os.path.abspath(s.metadata["source"])}:' for s in answer["source_documents"]]))
return answer

@@ -2,8 +2,10 @@ import glob
import os
import sys
import tiktoken
from git import Repo
from halo import Halo
from langchain import FAISS
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.document_loaders import TextLoader
@@ -52,3 +54,19 @@ def load_files(root_dir):
docs.extend(loader.load_and_split())
spinners.succeed(f"Loaded {len(docs)} documents")
return docs
def calculate_cost(texts, model_name):
enc = tiktoken.encoding_for_model(model_name)
all_text = ''.join([text.page_content for text in texts])
tokens = enc.encode(all_text)
token_count = len(tokens)
cost = (token_count / 1000) * 0.0004
return cost
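# The 0.0004 figure corresponds to OpenAI's text-embedding-ada-002 price of
# $0.0004 per 1K tokens, so e.g. 100,000 tokens of source text cost roughly $0.04.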
def get_local_vector_store(embeddings, path):
try:
return FAISS.load_local(path, embeddings)
except:
return None
