feat: add file loading with multiprocessing

- Add multiprocessing to load files in parallel
- Update loader mapping to handle various file types
Saryev Rustam 2023-05-30 10:19:28 +03:00
parent 572ee1bf4e
commit 675e7778ff
5 changed files with 876 additions and 113 deletions

README.md

@@ -1,4 +1,4 @@
-## talk-codebase: tool for Chatting with Your Codebase. OpenAI, LlamaCpp, GPT4All
+## talk-codebase: Tool for chatting with your codebase and docs using OpenAI, LlamaCpp, and GPT-4-All
 [![Node.js Package](https://github.com/rsaryev/talk-codebase/actions/workflows/python-publish.yml/badge.svg)](https://github.com/rsaryev/talk-codebase/actions/workflows/python-publish.yml)
@@ -8,43 +8,73 @@
 ## Description
-Talk-codebase is a powerful tool that allows you to converse with your codebase. It uses LLMs to answer your queries.
-You can use [GPT4All](https://github.com/nomic-ai/gpt4all) for offline code processing without sharing your code with
-third parties. Alternatively, you can use OpenAI if privacy is not a concern for you. You can switch between these two
-options quickly and easily.
-Project created for educational purposes. It is not recommended to use it in production.
+Talk-codebase is a tool that allows you to converse with your codebase using LLMs to answer your queries. It supports
+offline code processing using [GPT4All](https://github.com/nomic-ai/gpt4all) without sharing your code with third
+parties, or you can use OpenAI if privacy is not a concern for you. It is only recommended for educational purposes and
+not for production use.
 ## Installation
-To install `talk-codebase`, you need to have Python 3.9 and an OpenAI API
-key [api-keys](https://platform.openai.com/account/api-keys).
-Additionally, if you want to use the GPT4All model, you need to download
-the [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) model. If you prefer a
-different model, you can download it from [GPT4All](https://gpt4all.io) and specify its path in the configuration.
-If you want some files to be ignored, add them to .gitignore.
+To install `talk-codebase`, run the following command in your terminal:
 ```bash
 pip install talk-codebase
 ```
 ## Usage
-Talk-codebase works only with files of popular programming languages and .txt files. All other files will be ignored.
-If you want some files to be ignored, add them to .gitignore.
 Once `talk-codebase` is installed, you can use it to chat with your codebase by running the following command:
 ```bash
-# Start chatting with your codebase
-talk-codebase chat <directory>
-# Configure or edit configuration ~/.config.yaml
-talk-codebase configure
-# Help
-talk-codebase --help
+talk-codebase chat <path-to-your-codebase>
 ```
-## Requirements
-- Python 3.9
-- OpenAI API key [api-keys](https://platform.openai.com/account/api-keys)
-- If you want to use GPT4All, you need to download the
-  model [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) and specify the path
-  to it in the configuration.
+If you need to configure or edit the configuration, you can run:
+```bash
+talk-codebase configure
+```
+You can also edit the configuration manually by editing the `~/.config.yaml` file.
+If for some reason you cannot find the configuration file, just run the tool and at the very beginning it will output
+the path to the configuration file.
+```yaml
+# The OpenAI API key. You can get it from https://platform.openai.com/account/api-keys
+api_key: sk-xxx
+# maximum overlap between chunks; some overlap maintains continuity between chunks
+chunk_overlap: '50'
+# maximum size of a chunk
+chunk_size: '500'
+# number of chunks retrieved as context for each prompt
+k: '4'
+# maximum number of tokens for the LLM
+max_tokens: '1048'
+# name of the model (OpenAI only)
+model_name: gpt-3.5-turbo
+# path to the LLM file on disk (local models only)
+model_path: models/ggml-gpt4all-j-v1.3-groovy.bin
+# type of the LLM model: either local or openai
+model_type: openai
+```
+## The supported extensions:
+- [x] `.csv`
+- [x] `.doc`
+- [x] `.docx`
+- [x] `.epub`
+- [x] `.md`
+- [x] `.pdf`
+- [x] `.txt`
+- [x] `popular programming languages`
 ## Contributing
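The chunking and retrieval keys in the new configuration block (`chunk_size`, `chunk_overlap`, `k`) follow the usual LangChain indexing flow. Below is a minimal sketch of how they plausibly fit together; the sample file path and the exact splitter and embedding choices are illustrative assumptions, not part of this commit.

```python
# Sketch: how chunk_size, chunk_overlap and k from ~/.config.yaml
# typically feed a LangChain splitter and FAISS retriever.
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# Illustrative input file, not from the repository
docs = TextLoader("app.py", encoding="utf8").load()

# chunk_size caps each chunk; chunk_overlap keeps continuity between chunks
splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_documents(docs)

index = FAISS.from_documents(chunks, OpenAIEmbeddings())
# k is how many chunks are retrieved as context for each question
retriever = index.as_retriever(search_kwargs={"k": 4})
```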

poetry.lock (generated): diff suppressed because it is too large.

pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "talk-codebase"
-version = "0.1.25"
+version = "0.1.26"
 description = "talk-codebase is a powerful tool for querying and analyzing codebases."
 authors = ["Saryev Rustam <rustam1997@gmail.com>"]
 readme = "README.md"
@@ -20,6 +20,7 @@ gitpython = "^3.1.31"
 questionary = "^1.10.0"
 gpt4all = "^0.2.3"
 sentence-transformers = "^2.2.2"
+unstructured = "^0.6.10"
 [build-system]

talk_codebase/consts.py

@@ -1,3 +1,6 @@
+from langchain.document_loaders import CSVLoader, UnstructuredWordDocumentLoader, UnstructuredEPubLoader, \
+    PDFMinerLoader, UnstructuredMarkdownLoader, TextLoader
+
 EXCLUDE_DIRS = ['__pycache__', '.venv', '.git', '.idea', 'venv', 'env', 'node_modules', 'dist', 'build', '.vscode',
                 '.github', '.gitlab']
 ALLOW_FILES = ['.txt', '.js', '.mjs', '.ts', '.tsx', '.css', '.scss', '.less', '.html', '.htm', '.json', '.py',
@@ -17,3 +20,39 @@ DEFAULT_CONFIG = {
     "model_path": "models/ggml-gpt4all-j-v1.3-groovy.bin",
     "model_type": MODEL_TYPES["OPENAI"],
 }
+
+LOADER_MAPPING = {
+    ".csv": {
+        "loader": CSVLoader,
+        "args": {}
+    },
+    ".doc": {
+        "loader": UnstructuredWordDocumentLoader,
+        "args": {}
+    },
+    ".docx": {
+        "loader": UnstructuredWordDocumentLoader,
+        "args": {}
+    },
+    ".epub": {
+        "loader": UnstructuredEPubLoader,
+        "args": {}
+    },
+    ".md": {
+        "loader": UnstructuredMarkdownLoader,
+        "args": {}
+    },
+    ".pdf": {
+        "loader": PDFMinerLoader,
+        "args": {}
+    }
+}
+
+for ext in ALLOW_FILES:
+    if ext not in LOADER_MAPPING:
+        LOADER_MAPPING[ext] = {
+            "loader": TextLoader,
+            "args": {
+                "encoding": "utf8"
+            }
+        }
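`LOADER_MAPPING` keys loader classes by file extension, and the trailing loop fills in `TextLoader` as the fallback for every extension in `ALLOW_FILES` that has no dedicated loader. A minimal sketch of the resulting dispatch (the sample path is an illustrative assumption, and it uses `os.path.splitext` where the commit itself scans with `endswith`):

```python
# Sketch: resolve a loader from LOADER_MAPPING by extension and load one file.
import os

from talk_codebase.consts import LOADER_MAPPING

file_path = "docs/notes.md"  # illustrative path, not from the repository
ext = os.path.splitext(file_path)[1]  # '.md'

entry = LOADER_MAPPING.get(ext)
if entry is not None:
    # Each entry pairs a LangChain loader class with its keyword arguments,
    # e.g. UnstructuredMarkdownLoader for '.md' or TextLoader for code files.
    loader = entry["loader"](file_path, **entry["args"])
    documents = loader.load_and_split()
```

The Word, EPUB, and Markdown loaders used here are what the new `unstructured = "^0.6.10"` dependency in `pyproject.toml` is for.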

talk_codebase/utils.py

@@ -1,15 +1,14 @@
 import glob
+import multiprocessing
 import os
 import sys
 import tiktoken
 from git import Repo
 from halo import Halo
 from langchain import FAISS
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.document_loaders import TextLoader
-from talk_codebase.consts import EXCLUDE_FILES, ALLOW_FILES
+from talk_codebase.consts import LOADER_MAPPING, EXCLUDE_FILES
 def get_repo(root_dir):
@@ -43,16 +42,28 @@ class StreamStdOut(StreamingStdOutCallbackHandler):
 def load_files(root_dir):
     spinners = Halo(text='Loading files', spinner='dots').start()
-    docs = []
-    for file_path in glob.glob(os.path.join(root_dir, '**/*'), recursive=True):
-        if is_ignored(file_path, root_dir):
-            continue
-        if any(file_path.endswith(allow_file) for allow_file in ALLOW_FILES) and not any(
-                file_path.endswith(exclude_file) for exclude_file in EXCLUDE_FILES):
-            loader = TextLoader(file_path, encoding='utf-8')
-            docs.extend(loader.load_and_split())
-    spinners.succeed(f"Loaded {len(docs)} documents")
+    num_cpus = multiprocessing.cpu_count()
+    loaded_files = []
+    with multiprocessing.Pool(num_cpus) as pool:
+        futures = []
+        for file_path in glob.glob(os.path.join(root_dir, '**/*'), recursive=True):
+            if is_ignored(file_path, root_dir):
+                continue
+            if any(
+                    file_path.endswith(exclude_file) for exclude_file in EXCLUDE_FILES):
+                continue
+            for ext in LOADER_MAPPING:
+                if file_path.endswith(ext):
+                    loader = LOADER_MAPPING[ext]['loader']
+                    args = LOADER_MAPPING[ext]['args']
+                    load = loader(file_path, **args)
+                    futures.append(pool.apply_async(load.load_and_split))
+                    loaded_files.append(file_path)
+        docs = []
+        for future in futures:
+            docs.extend(future.get())
+    print('\n' + '\n'.join([f'📄 {os.path.abspath(file_path)}:' for file_path in loaded_files]))
     return docs
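Two properties of the new `load_files` are worth noting: `pool.apply_async` pickles each loader's bound `load_and_split` method to ship it to a worker process, so every loader in `LOADER_MAPPING` must be picklable, and `future.get()` collects the results in submission order. A plausible call site, where the directory path and the downstream indexing step are assumptions rather than part of this commit:

```python
# Sketch: call the parallel loader and index the returned chunks.
import os

from langchain import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

from talk_codebase.utils import load_files

# Illustrative project directory
docs = load_files(os.path.expanduser("~/projects/my-app"))

# Workers already returned split Document chunks, so they can be
# embedded and indexed directly.
index = FAISS.from_documents(docs, HuggingFaceEmbeddings())
print(f"Indexed {len(docs)} chunks")
```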