Mirror of https://github.com/rsaryev/talk-codebase, synced 2024-11-10 07:10:31 +00:00
feat: add file loading with multiprocessing

- Add multiprocessing to load files in parallel
- Update loader mapping to handle various file types
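In outline, the change fans each file out to a worker process and gathers the split documents afterwards. A minimal sketch of that pattern (simplified from the utils.py hunk below; `load_one` is a hypothetical stand-in for the per-extension loaders):

```python
import multiprocessing


def load_one(path):
    # Hypothetical stand-in for LOADER_MAPPING[ext]["loader"](path, **args).load_and_split()
    with open(path, encoding="utf8") as f:
        return [f.read()]


def load_all(paths):
    # Submit one async task per file, then collect results in submission order
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        futures = [pool.apply_async(load_one, (p,)) for p in paths]
        return [doc for future in futures for doc in future.get()]


if __name__ == "__main__":
    print(len(load_all(["README.md"])))
```

Note that `apply_async` returns immediately with an `AsyncResult`; the blocking happens in `get()`, so files are loaded in parallel across `cpu_count()` workers.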
parent 572ee1bf4e
commit 675e7778ff

README.md (82 lines changed)
@@ -1,4 +1,4 @@
-## talk-codebase: tool for Chatting with Your Codebase. OpenAI, LlamaCpp, GPT4All
+## talk-codebase: Tool for chatting with your codebase and docs using OpenAI, LlamaCpp, and GPT4All
 
 [![Node.js Package](https://github.com/rsaryev/talk-codebase/actions/workflows/python-publish.yml/badge.svg)](https://github.com/rsaryev/talk-codebase/actions/workflows/python-publish.yml)
 
@@ -8,43 +8,73 @@
 
 ## Description
 
-Talk-codebase is a powerful tool that allows you to converse with your codebase. It uses LLMs to answer your queries.
-
-You can use [GPT4All](https://github.com/nomic-ai/gpt4all) for offline code processing without sharing your code with
-third parties. Alternatively, you can use OpenAI if privacy is not a concern for you. You can switch between these two
-options quickly and easily.
-
-Project created for educational purposes. It is not recommended to use it in production.
+Talk-codebase is a tool that allows you to converse with your codebase using LLMs to answer your queries. It supports
+offline code processing using [GPT4All](https://github.com/nomic-ai/gpt4all) without sharing your code with third
+parties, or you can use OpenAI if privacy is not a concern for you. It is recommended only for educational purposes,
+not for production use.
 
 ## Installation
 
+To install `talk-codebase`, you need Python 3.9 and an OpenAI API
+key [api-keys](https://platform.openai.com/account/api-keys).
+Additionally, if you want to use the GPT4All model, you need to download
+the [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) model. If you prefer a
+different model, you can download it from [GPT4All](https://gpt4all.io) and specify its
+path in the configuration. If you want some files to be ignored, add them to .gitignore.
+
 To install `talk-codebase`, run the following command in your terminal:
 
 ```bash
 pip install talk-codebase
 ```
 
 ## Usage
 
-Talk-codebase works only with files of popular programming languages and .txt files. All other files will be ignored.
-If you want some files to be ignored, add them to .gitignore.
+Once `talk-codebase` is installed, you can use it to chat with your codebase by running the following command:
 
 ```bash
-# Start chatting with your codebase
-talk-codebase chat <directory>
-
-# Configure or edit configuration ~/.config.yaml
-talk-codebase configure
-
-# Help
-talk-codebase --help
+talk-codebase chat <path-to-your-codebase>
 ```
 
-## Requirements
+If you need to configure or edit the configuration, you can run:
 
-- Python 3.9
-- OpenAI API key [api-keys](https://platform.openai.com/account/api-keys)
-- If you want to use GPT4All, you need to download the
-  model [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) and specify the path
-  to it in the configuration.
+```bash
+talk-codebase configure
+```
 
+You can also edit the configuration manually by editing the `~/.config.yaml` file.
+If for some reason you cannot find the configuration file, just run the tool; at the very beginning it will print
+the path to the configuration file.
+
+```yaml
+# The OpenAI API key. You can get it from https://platform.openai.com/account/api-keys
+api_key: sk-xxx
+# Maximum overlap between chunks; some overlap helps maintain continuity between chunks
+chunk_overlap: '50'
+# Maximum size of a chunk
+chunk_size: '500'
+# Number of documents to retrieve for each prompt
+k: '4'
+# Maximum number of tokens for the LLM
+max_tokens: '1048'
+# Name of the LLM model (OpenAI only)
+model_name: gpt-3.5-turbo
+# Path to the LLM file on disk (for the local model type)
+model_path: models/ggml-gpt4all-j-v1.3-groovy.bin
+# Type of the LLM model: either local or openai
+model_type: openai
+```
+
+## Supported extensions
+
+- [x] `.csv`
+- [x] `.doc`
+- [x] `.docx`
+- [x] `.epub`
+- [x] `.md`
+- [x] `.pdf`
+- [x] `.txt`
+- [x] popular programming languages
 
 ## Contributing
poetry.lock (generated, 828 lines changed)
File diff suppressed because it is too large.
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "talk-codebase"
-version = "0.1.25"
+version = "0.1.26"
 description = "talk-codebase is a powerful tool for querying and analyzing codebases."
 authors = ["Saryev Rustam <rustam1997@gmail.com>"]
 readme = "README.md"
@@ -20,6 +20,7 @@ gitpython = "^3.1.31"
 questionary = "^1.10.0"
 gpt4all = "^0.2.3"
 sentence-transformers = "^2.2.2"
+unstructured = "^0.6.10"
 
 
 [build-system]
talk_codebase/consts.py

@@ -1,3 +1,6 @@
+from langchain.document_loaders import CSVLoader, UnstructuredWordDocumentLoader, UnstructuredEPubLoader, \
+    PDFMinerLoader, UnstructuredMarkdownLoader, TextLoader
+
 EXCLUDE_DIRS = ['__pycache__', '.venv', '.git', '.idea', 'venv', 'env', 'node_modules', 'dist', 'build', '.vscode',
                 '.github', '.gitlab']
 ALLOW_FILES = ['.txt', '.js', '.mjs', '.ts', '.tsx', '.css', '.scss', '.less', '.html', '.htm', '.json', '.py',
@@ -17,3 +20,39 @@ DEFAULT_CONFIG = {
     "model_path": "models/ggml-gpt4all-j-v1.3-groovy.bin",
     "model_type": MODEL_TYPES["OPENAI"],
 }
+
+LOADER_MAPPING = {
+    ".csv": {
+        "loader": CSVLoader,
+        "args": {}
+    },
+    ".doc": {
+        "loader": UnstructuredWordDocumentLoader,
+        "args": {}
+    },
+    ".docx": {
+        "loader": UnstructuredWordDocumentLoader,
+        "args": {}
+    },
+    ".epub": {
+        "loader": UnstructuredEPubLoader,
+        "args": {}
+    },
+    ".md": {
+        "loader": UnstructuredMarkdownLoader,
+        "args": {}
+    },
+    ".pdf": {
+        "loader": PDFMinerLoader,
+        "args": {}
+    }
+}
+
+for ext in ALLOW_FILES:
+    if ext not in LOADER_MAPPING:
+        LOADER_MAPPING[ext] = {
+            "loader": TextLoader,
+            "args": {
+                "encoding": "utf8"
+            }
+        }
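Given this mapping, choosing a loader reduces to an extension lookup. A minimal sketch of the dispatch, assuming the `LOADER_MAPPING` defined above and a hypothetical `file_path` (the real call site is in the utils.py hunk below):

```python
# Sketch: pick the loader class for a file by its extension, then split it into documents
file_path = "docs/guide.md"  # hypothetical input

for ext, entry in LOADER_MAPPING.items():
    if file_path.endswith(ext):
        loader = entry["loader"](file_path, **entry["args"])
        docs = loader.load_and_split()
        break
```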
talk_codebase/utils.py

@@ -1,15 +1,14 @@
 import glob
+import multiprocessing
 import os
 import sys
 
 import tiktoken
 from git import Repo
 from halo import Halo
 from langchain import FAISS
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.document_loaders import TextLoader
 
-from talk_codebase.consts import EXCLUDE_FILES, ALLOW_FILES
+from talk_codebase.consts import LOADER_MAPPING, EXCLUDE_FILES
 
 
 def get_repo(root_dir):
@@ -43,16 +42,28 @@ class StreamStdOut(StreamingStdOutCallbackHandler):
 
 
 def load_files(root_dir):
     spinners = Halo(text='Loading files', spinner='dots').start()
-    docs = []
-    for file_path in glob.glob(os.path.join(root_dir, '**/*'), recursive=True):
-        if is_ignored(file_path, root_dir):
-            continue
-        if any(file_path.endswith(allow_file) for allow_file in ALLOW_FILES) and not any(
-                file_path.endswith(exclude_file) for exclude_file in EXCLUDE_FILES):
-            loader = TextLoader(file_path, encoding='utf-8')
-            docs.extend(loader.load_and_split())
+    num_cpus = multiprocessing.cpu_count()
+    loaded_files = []
+    with multiprocessing.Pool(num_cpus) as pool:
+        futures = []
+        for file_path in glob.glob(os.path.join(root_dir, '**/*'), recursive=True):
+            if is_ignored(file_path, root_dir):
+                continue
+            if any(
+                    file_path.endswith(exclude_file) for exclude_file in EXCLUDE_FILES):
+                continue
+            for ext in LOADER_MAPPING:
+                if file_path.endswith(ext):
+                    loader = LOADER_MAPPING[ext]['loader']
+                    args = LOADER_MAPPING[ext]['args']
+                    load = loader(file_path, **args)
+                    futures.append(pool.apply_async(load.load_and_split))
+                    loaded_files.append(file_path)
+        docs = []
+        for future in futures:
+            docs.extend(future.get())
     spinners.succeed(f"Loaded {len(docs)} documents")
+    print('\n' + '\n'.join([f'📄 {os.path.abspath(file_path)}:' for file_path in loaded_files]))
     return docs