From c8d8a8d0b5c3b32838ecfa59e2f646130df7c71f Mon Sep 17 00:00:00 2001 From: Pavel Date: Sun, 25 Feb 2024 16:03:18 +0300 Subject: [PATCH] Fixing ingestion metadata grouping --- .gitignore | 1 + application/parser/file/bulk.py | 18 +++++++++++++++--- application/parser/token_func.py | 15 +++++++-------- frontend/src/conversation/Conversation.tsx | 3 +-- 4 files changed, 24 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index 053e579..d7747ef 100644 --- a/.gitignore +++ b/.gitignore @@ -172,3 +172,4 @@ application/vectors/ node_modules/ .vscode/settings.json models/ +model/ diff --git a/application/parser/file/bulk.py b/application/parser/file/bulk.py index af17193..aec6c8c 100644 --- a/application/parser/file/bulk.py +++ b/application/parser/file/bulk.py @@ -147,12 +147,24 @@ class SimpleDirectoryReader(BaseReader): # do standard read with open(input_file, "r", errors=self.errors) as f: data = f.read() + # Prepare metadata for this file + if self.file_metadata is not None: + file_metadata = self.file_metadata(str(input_file)) + else: + # Provide a default empty metadata + file_metadata = {'title': '', 'store': ''} + # TODO: Find a case with no metadata and check if breaks anything + if isinstance(data, List): - data_list.extend(data) + # Extend data_list with each item in the data list + data_list.extend([str(d) for d in data]) + # For each item in the data list, add the file's metadata to metadata_list + metadata_list.extend([file_metadata for _ in data]) else: + # Add the single piece of data to data_list data_list.append(str(data)) - if self.file_metadata is not None: - metadata_list.append(self.file_metadata(str(input_file))) + # Add the file's metadata to metadata_list + metadata_list.append(file_metadata) if concatenate: return [Document("\n".join(data_list))] diff --git a/application/parser/token_func.py b/application/parser/token_func.py index 14b231f..36ae7e5 100644 --- a/application/parser/token_func.py +++ 
b/application/parser/token_func.py @@ -21,16 +21,15 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) for doc in documents: doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text)) - if current_group is None: - current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding, - extra_info=doc.extra_info) - elif len(tiktoken.get_encoding("cl100k_base").encode( - current_group.text)) + doc_len < max_tokens and doc_len < min_tokens: - current_group.text += " " + doc.text + # Check if current group is empty or if the document can be added based on token count and matching metadata + if current_group is None or (len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len < min_tokens and current_group.extra_info == doc.extra_info): + if current_group is None: + current_group = doc # Use the document directly to retain its metadata + else: + current_group.text += " " + doc.text # Append text to the current group else: docs.append(current_group) - current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding, - extra_info=doc.extra_info) + current_group = doc # Start a new group with the current document if current_group is not None: docs.append(current_group) diff --git a/frontend/src/conversation/Conversation.tsx b/frontend/src/conversation/Conversation.tsx index ba31144..5ed43d9 100644 --- a/frontend/src/conversation/Conversation.tsx +++ b/frontend/src/conversation/Conversation.tsx @@ -201,8 +201,7 @@ export default function Conversation() { )}

- This is a chatbot that uses the GPT-3, Faiss and LangChain to answer - questions. + DocsGPT uses GenAI, please review critical information using sources.