Mirror of https://github.com/rsaryev/talk-codebase, synced 2024-11-10 07:10:31 +00:00
feat: add file loading with multiprocessing

- Add multiprocessing to load files in parallel
- Update loader mapping to handle various file types
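In outline, the change fans each file out to a worker process and gathers the split documents afterwards. A minimal sketch of that pattern (simplified from the utils.py hunk below; `load_one` is a hypothetical stand-in for the per-extension loaders):

```python
import multiprocessing


def load_one(path):
    # Hypothetical stand-in for LOADER_MAPPING[ext]["loader"](path, **args).load_and_split()
    with open(path, encoding="utf8") as f:
        return [f.read()]


def load_all(paths):
    # Submit one async task per file, then collect results in submission order
    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
        futures = [pool.apply_async(load_one, (p,)) for p in paths]
        return [doc for future in futures for doc in future.get()]


if __name__ == "__main__":
    print(len(load_all(["README.md"])))
```

Note that `apply_async` returns immediately with an `AsyncResult`; the blocking happens in `get()`, so files are loaded in parallel across `cpu_count()` workers.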
parent 572ee1bf4e
commit 675e7778ff

README.md (82 lines changed)
@@ -1,4 +1,4 @@
-## talk-codebase: tool for Chatting with Your Codebase. OpenAI, LlamaCpp, GPT4All
+## talk-codebase: Tool for chatting with your codebase and docs using OpenAI, LlamaCpp, and GPT4All
 
 [![Node.js Package](https://github.com/rsaryev/talk-codebase/actions/workflows/python-publish.yml/badge.svg)](https://github.com/rsaryev/talk-codebase/actions/workflows/python-publish.yml)
 
@@ -8,43 +8,73 @@
 
 ## Description
 
-Talk-codebase is a powerful tool that allows you to converse with your codebase. It uses LLMs to answer your queries.
-
-You can use [GPT4All](https://github.com/nomic-ai/gpt4all) for offline code processing without sharing your code with
-third parties. Alternatively, you can use OpenAI if privacy is not a concern for you. You can switch between these two
-options quickly and easily.
-
-Project created for educational purposes. It is not recommended to use it in production.
+Talk-codebase is a tool that allows you to converse with your codebase using LLMs to answer your queries. It supports
+offline code processing using [GPT4All](https://github.com/nomic-ai/gpt4all) without sharing your code with third
+parties, or you can use OpenAI if privacy is not a concern for you. It is recommended only for educational purposes,
+not for production use.
 
 ## Installation
 
+To install `talk-codebase`, you need Python 3.9 and an OpenAI API
+key [api-keys](https://platform.openai.com/account/api-keys).
+Additionally, if you want to use the GPT4All model, you need to download
+the [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) model. If you prefer a
+different model, you can download it from [GPT4All](https://gpt4all.io) and specify its
+path in the configuration. If you want some files to be ignored, add them to .gitignore.
+
 To install `talk-codebase`, run the following command in your terminal:
 
 ```bash
 pip install talk-codebase
 ```
 
 ## Usage
 
-Talk-codebase works only with files of popular programming languages and .txt files. All other files will be ignored.
-If you want some files to be ignored, add them to .gitignore.
+Once `talk-codebase` is installed, you can use it to chat with your codebase by running the following command:
 
 ```bash
-# Start chatting with your codebase
-talk-codebase chat <directory>
-
-# Configure or edit configuration ~/.config.yaml
-talk-codebase configure
-
-# Help
-talk-codebase --help
+talk-codebase chat <path-to-your-codebase>
 ```
 
-## Requirements
+If you need to configure or edit the configuration, you can run:
 
-- Python 3.9
-- OpenAI API key [api-keys](https://platform.openai.com/account/api-keys)
-- If you want to use GPT4All, you need to download the
-  model [ggml-gpt4all-j-v1.3-groovy.bin](https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin) and specify the path
-  to it in the configuration.
+```bash
+talk-codebase configure
+```
 
+You can also edit the configuration manually by editing the `~/.config.yaml` file.
+If for some reason you cannot find the configuration file, just run the tool; at the very beginning it will print
+the path to the configuration file.
+
+```yaml
+# The OpenAI API key. You can get it from https://platform.openai.com/account/api-keys
+api_key: sk-xxx
+# Maximum overlap between chunks; some overlap helps maintain continuity between chunks
+chunk_overlap: '50'
+# Maximum size of a chunk
+chunk_size: '500'
+# Number of documents to retrieve for each prompt
+k: '4'
+# Maximum number of tokens for the LLM
+max_tokens: '1048'
+# Name of the LLM model (OpenAI only)
+model_name: gpt-3.5-turbo
+# Path to the LLM file on disk (for the local model type)
+model_path: models/ggml-gpt4all-j-v1.3-groovy.bin
+# Type of the LLM model: either local or openai
+model_type: openai
+```
+
+## Supported extensions
+
+- [x] `.csv`
+- [x] `.doc`
+- [x] `.docx`
+- [x] `.epub`
+- [x] `.md`
+- [x] `.pdf`
+- [x] `.txt`
+- [x] popular programming languages
 
 ## Contributing
poetry.lock (generated, 828 lines changed)
File diff suppressed because it is too large.
pyproject.toml

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "talk-codebase"
-version = "0.1.25"
+version = "0.1.26"
 description = "talk-codebase is a powerful tool for querying and analyzing codebases."
 authors = ["Saryev Rustam <rustam1997@gmail.com>"]
 readme = "README.md"
@@ -20,6 +20,7 @@ gitpython = "^3.1.31"
 questionary = "^1.10.0"
 gpt4all = "^0.2.3"
 sentence-transformers = "^2.2.2"
+unstructured = "^0.6.10"
 
 
 [build-system]
talk_codebase/consts.py

@@ -1,3 +1,6 @@
+from langchain.document_loaders import CSVLoader, UnstructuredWordDocumentLoader, UnstructuredEPubLoader, \
+    PDFMinerLoader, UnstructuredMarkdownLoader, TextLoader
+
 EXCLUDE_DIRS = ['__pycache__', '.venv', '.git', '.idea', 'venv', 'env', 'node_modules', 'dist', 'build', '.vscode',
                 '.github', '.gitlab']
 ALLOW_FILES = ['.txt', '.js', '.mjs', '.ts', '.tsx', '.css', '.scss', '.less', '.html', '.htm', '.json', '.py',
@@ -17,3 +20,39 @@ DEFAULT_CONFIG = {
     "model_path": "models/ggml-gpt4all-j-v1.3-groovy.bin",
     "model_type": MODEL_TYPES["OPENAI"],
 }
+
+LOADER_MAPPING = {
+    ".csv": {
+        "loader": CSVLoader,
+        "args": {}
+    },
+    ".doc": {
+        "loader": UnstructuredWordDocumentLoader,
+        "args": {}
+    },
+    ".docx": {
+        "loader": UnstructuredWordDocumentLoader,
+        "args": {}
+    },
+    ".epub": {
+        "loader": UnstructuredEPubLoader,
+        "args": {}
+    },
+    ".md": {
+        "loader": UnstructuredMarkdownLoader,
+        "args": {}
+    },
+    ".pdf": {
+        "loader": PDFMinerLoader,
+        "args": {}
+    }
+}
+
+for ext in ALLOW_FILES:
+    if ext not in LOADER_MAPPING:
+        LOADER_MAPPING[ext] = {
+            "loader": TextLoader,
+            "args": {
+                "encoding": "utf8"
+            }
+        }
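Given this mapping, choosing a loader reduces to an extension lookup. A minimal sketch of the dispatch, assuming the `LOADER_MAPPING` defined above and a hypothetical `file_path` (the real call site is in the utils.py hunk below):

```python
# Sketch: pick the loader class for a file by its extension, then split it into documents
file_path = "docs/guide.md"  # hypothetical input

for ext, entry in LOADER_MAPPING.items():
    if file_path.endswith(ext):
        loader = entry["loader"](file_path, **entry["args"])
        docs = loader.load_and_split()
        break
```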
talk_codebase/utils.py

@@ -1,15 +1,14 @@
 import glob
+import multiprocessing
 import os
 import sys
 
 import tiktoken
 from git import Repo
 from halo import Halo
 from langchain import FAISS
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
-from langchain.document_loaders import TextLoader
 
-from talk_codebase.consts import EXCLUDE_FILES, ALLOW_FILES
+from talk_codebase.consts import LOADER_MAPPING, EXCLUDE_FILES
 
 
 def get_repo(root_dir):
@@ -43,16 +42,28 @@ class StreamStdOut(StreamingStdOutCallbackHandler):
 
 
 def load_files(root_dir):
     spinners = Halo(text='Loading files', spinner='dots').start()
-    docs = []
-    for file_path in glob.glob(os.path.join(root_dir, '**/*'), recursive=True):
-        if is_ignored(file_path, root_dir):
-            continue
-        if any(file_path.endswith(allow_file) for allow_file in ALLOW_FILES) and not any(
-                file_path.endswith(exclude_file) for exclude_file in EXCLUDE_FILES):
-            loader = TextLoader(file_path, encoding='utf-8')
-            docs.extend(loader.load_and_split())
+    num_cpus = multiprocessing.cpu_count()
+    loaded_files = []
+    with multiprocessing.Pool(num_cpus) as pool:
+        futures = []
+        for file_path in glob.glob(os.path.join(root_dir, '**/*'), recursive=True):
+            if is_ignored(file_path, root_dir):
+                continue
+            if any(
+                    file_path.endswith(exclude_file) for exclude_file in EXCLUDE_FILES):
+                continue
+            for ext in LOADER_MAPPING:
+                if file_path.endswith(ext):
+                    loader = LOADER_MAPPING[ext]['loader']
+                    args = LOADER_MAPPING[ext]['args']
+                    load = loader(file_path, **args)
+                    futures.append(pool.apply_async(load.load_and_split))
+                    loaded_files.append(file_path)
+        docs = []
+        for future in futures:
+            docs.extend(future.get())
     spinners.succeed(f"Loaded {len(docs)} documents")
+    print('\n' + '\n'.join([f'📄 {os.path.abspath(file_path)}:' for file_path in loaded_files]))
     return docs