You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
DocsGPT/scripts/ingest.py

110 lines
4.3 KiB
Python

import os
import sys
import nltk
import dotenv
import typer
import ast
from collections import defaultdict
from pathlib import Path
from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.open_ai_func import call_openai_api, get_user_permission
from parser.py2doc import get_classes, get_functions, transform_to_docs
# Load variables from a local .env file into the environment at import time,
# before any command runs.
dotenv.load_dotenv()
# Typer application object; the functions decorated with @app.command() in this
# file become its sub-commands. Shell-completion options are switched off.
app = typer.Typer(add_completion=False)
# Fetch the NLTK resources on every import; quiet=True suppresses the download
# log. NOTE(review): presumably required by the file parsers — confirm.
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
# Splits all files in the specified folder(s) into documents.
@app.command()
# NOTE: the multi-line help strings below are runtime text shown by --help;
# their continuation lines are intentionally left exactly as they were.
def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
                                    help="Whether to skip price confirmation"),
           dir: Optional[List[str]] = typer.Option(["inputs"],
                                                   help="""List of paths to directory for index creation.
E.g. --dir inputs --dir inputs2"""),
           file: Optional[List[str]] = typer.Option(None,
                                                    help="""File paths to use (Optional; overrides dir).
E.g. --file inputs/1.md --file inputs/2.md"""),
           recursive: Optional[bool] = typer.Option(True,
                                                    help="Whether to recursively search in subdirectories."),
           limit: Optional[int] = typer.Option(None,
                                               help="Maximum number of files to read."),
           formats: Optional[List[str]] = typer.Option([".rst", ".md"],
                                                       help="""List of required extensions (list with .)
Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html"""),
           exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")):
    """
    Creates index from specified location or files.
    By default /inputs folder is used, .rst and .md are parsed.
    """
    # The docstring above doubles as the command's --help text, so maintainer
    # notes are kept in comments:
    #   * every entry of `dir` is processed separately and gets its own output
    #     folder name, derived from the last path component;
    #   * `file`, `recursive`, `limit`, `formats` and `exclude` are forwarded
    #     unchanged to SimpleDirectoryReader for every directory;
    #   * `yes` skips the confirmation step and calls the API directly.

    def process_one_docs(directory, folder_name):
        """Load one directory, split it into chunks and submit them under folder_name."""
        reader = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
                                       required_exts=formats, num_files_limit=limit,
                                       exclude_hidden=exclude)
        raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in reader.load_data()]

        # Here we split the documents, as needed, into smaller chunks.
        # We do this due to the context limits of the LLMs.
        docs = RecursiveCharacterTextSplitter().split_documents(raw_docs)

        # The API is called directly only when command line arguments were
        # given AND --yes was set; in every other case the user is asked for
        # permission first.
        if yes and len(sys.argv) > 1:
            call_openai_api(docs, folder_name)
        else:
            get_user_permission(docs, folder_name)

    # Derive one output folder name per input directory. Directories sharing a
    # base name get a numeric suffix: name, name_2, name_3, ...
    folder_counts = defaultdict(int)
    used_names = set()
    folder_names = []
    for dir_path in dir:
        base_name = os.path.basename(os.path.normpath(dir_path))
        folder_counts[base_name] += 1
        folder_name = base_name
        if folder_counts[base_name] > 1:
            folder_name = f"{base_name}_{folder_counts[base_name]}"
        # A generated suffix may clash with a directory that is literally
        # called e.g. "docs_2"; keep bumping the counter until the name is
        # free so that no two inputs write to the same output folder.
        while folder_name in used_names:
            folder_counts[base_name] += 1
            folder_name = f"{base_name}_{folder_counts[base_name]}"
        used_names.add(folder_name)
        folder_names.append(folder_name)

    for directory, folder_name in zip(dir, folder_names):
        process_one_docs(directory, folder_name)
@app.command()
def convert():
    """
    Parses every .py file found (recursively) under the inputs folder and
    passes the results of get_functions and get_classes to transform_to_docs.
    """
    # Both mappings are keyed by the file path (as a string), in glob order.
    functions_dict = {}
    classes_dict = {}
    for path in Path("inputs").glob("**/*.py"):
        # Python source is UTF-8 by default, so decode explicitly instead of
        # relying on the platform's locale encoding.
        code = path.read_text(encoding="utf-8")
        source = str(path)
        # get_functions is given the parsed AST, get_classes the raw source text.
        functions_dict[source] = get_functions(ast.parse(code))
        classes_dict[source] = get_classes(code)

    transform_to_docs(functions_dict, classes_dict)
# Start the Typer command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    app()