refactor: improve RAG (#676)

pull/677/head
sigoden 4 months ago committed by GitHub
parent 2f2b13c891
commit bbe08d6d81
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@@ -25,11 +25,11 @@ summarize_prompt: 'Summarize the discussion briefly in 200 words or less to use
# Text prompt used for including the summary of the entire session
summary_prompt: 'This is a summary of the chat history as a recap: '
# Define document loaders to control how `.file`/`--file` and RAG load files of specific formats.
# Define document loaders to control how RAG and `.file`/`--file` load files of specific formats.
document_loaders:
# You can add custom loaders using the following syntax:
# <file-extension>: <command-to-load-the-file>
# Note: Use `$1` for input filepath and `$2` for output filepath. If `$2` is not provided, output to stdout.
# Note: Use `$1` for input file and `$2` for output file. If `$2` is omitted, use stdout as output.
pdf: 'pdftotext $1 -' # Load .pdf file, see https://poppler.freedesktop.org
docx: 'pandoc --to plain $1' # Load .docx file
# xlsx: 'ssconvert $1 $2' # Load .xlsx file

@@ -316,8 +316,14 @@ impl Rag {
self.data.chunk_overlap,
&separator,
);
let metadata = metadata
.iter()
.map(|(k, v)| format!("{k}: {v}\n"))
.collect::<Vec<String>>()
.join("");
let split_options = SplitterChunkHeaderOptions::default().with_chunk_header(&format!(
"<document_metadata>\npath: {path}</document_metadata>\n\n"
"<document_metadata>\npath: {path}\n{metadata}</document_metadata>\n\n"
));
let document = RagDocument::new(contents);
let splitted_documents = splitter.split_documents(&[document], &split_options);
@@ -354,7 +360,7 @@ impl Rag {
self.data.add(next_file_id, files, document_ids, embeddings);
self.data.document_paths = document_paths;
progress(&spinner, "Building database".into());
progress(&spinner, "Building store".into());
self.hnsw = self.data.build_hnsw();
self.bm25 = self.data.build_bm25();

Loading…
Cancel
Save