From e49dd0cc6a5d86b4ac11ec3a89190e10fde25590 Mon Sep 17 00:00:00 2001
From: Alex
Date: Wed, 17 May 2023 21:41:24 +0100
Subject: [PATCH] metadata on ingestion

---
 application/worker.py | 4 +++-
 scripts/ingest.py     | 5 ++++-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/application/worker.py b/application/worker.py
index 5e47c8a9..d7674ade 100644
--- a/application/worker.py
+++ b/application/worker.py
@@ -19,6 +19,8 @@ try:
 except FileExistsError:
     pass
 
+def metadata_from_filename(title):
+    return {'title': title}
 
 def generate_random_string(length):
     return ''.join([string.ascii_letters[i % 52] for i in range(length)])
@@ -59,7 +61,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     raw_docs = SimpleDirectoryReader(input_dir=full_path, input_files=input_files, recursive=recursive,
                                      required_exts=formats, num_files_limit=limit,
-                                     exclude_hidden=exclude).load_data()
+                                     exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
 
     raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
 
     docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
diff --git a/scripts/ingest.py b/scripts/ingest.py
index 1aa27565..6ab9cce1 100644
--- a/scripts/ingest.py
+++ b/scripts/ingest.py
@@ -24,6 +24,9 @@
 nltk.download('punkt', quiet=True)
 nltk.download('averaged_perceptron_tagger', quiet=True)
 
+def metadata_from_filename(title):
+    return {'title': title}
+
 # Splits all files in specified folder to documents
 @app.command()
 def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
@@ -55,7 +58,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
     def process_one_docs(directory, folder_name):
         raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
                                          required_exts=formats, num_files_limit=limit,
-                                         exclude_hidden=exclude).load_data()
+                                         exclude_hidden=exclude, file_metadata=metadata_from_filename).load_data()
         # Here we split the documents, as needed, into smaller chunks.
         # We do this due to the context limits of the LLMs.