mirror of
https://github.com/arc53/DocsGPT
synced 2024-11-17 21:26:26 +00:00
metadata on ingestion
This commit is contained in:
parent
27c45ae24a
commit
e49dd0cc6a
@ -19,6 +19,8 @@ try:
|
||||
except FileExistsError:
|
||||
pass
|
||||
|
||||
def metadata_from_filename(title):
|
||||
return {'title': title}
|
||||
|
||||
def generate_random_string(length):
|
||||
return ''.join([string.ascii_letters[i % 52] for i in range(length)])
|
||||
@ -59,7 +61,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
|
||||
|
||||
raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive,
|
||||
required_exts=formats, num_files_limit=limit,
|
||||
exclude_hidden=exclude).load_data()
|
||||
exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
|
||||
raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
|
||||
|
||||
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
|
||||
|
@ -24,6 +24,9 @@ nltk.download('punkt', quiet=True)
|
||||
nltk.download('averaged_perceptron_tagger', quiet=True)
|
||||
|
||||
|
||||
def metadata_from_filename(title):
|
||||
return {'title': title}
|
||||
|
||||
# Splits all files in specified folder to documents
|
||||
@app.command()
|
||||
def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
|
||||
@ -55,7 +58,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
|
||||
def process_one_docs(directory, folder_name):
|
||||
raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
|
||||
required_exts=formats, num_files_limit=limit,
|
||||
exclude_hidden=exclude).load_data()
|
||||
exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
|
||||
|
||||
# Here we split the documents, as needed, into smaller chunks.
|
||||
# We do this due to the context limits of the LLMs.
|
||||
|
Loading…
Reference in New Issue
Block a user