refactor: improve RAG (#676)

pull/677/head
sigoden 4 months ago committed by GitHub
parent 2f2b13c891
commit bbe08d6d81
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@@ -25,11 +25,11 @@ summarize_prompt: 'Summarize the discussion briefly in 200 words or less to use
# Text prompt used for including the summary of the entire session # Text prompt used for including the summary of the entire session
summary_prompt: 'This is a summary of the chat history as a recap: ' summary_prompt: 'This is a summary of the chat history as a recap: '
# Define document loaders to control how `.file`/`--file` and RAG load files of specific formats. # Define document loaders to control how RAG and `.file`/`--file` load files of specific formats.
document_loaders: document_loaders:
# You can add custom loaders using the following syntax: # You can add custom loaders using the following syntax:
# <file-extension>: <command-to-load-the-file> # <file-extension>: <command-to-load-the-file>
# Note: Use `$1` for input filepath and `$2` for output filepath. If `$2` is not provided, output to stdout. # Note: Use `$1` for input file and `$2` for output file. If `$2` is omitted, use stdout as output.
pdf: 'pdftotext $1 -' # Load .pdf file, see https://poppler.freedesktop.org pdf: 'pdftotext $1 -' # Load .pdf file, see https://poppler.freedesktop.org
docx: 'pandoc --to plain $1' # Load .docx file docx: 'pandoc --to plain $1' # Load .docx file
# xlsx: 'ssconvert $1 $2' # Load .xlsx file # xlsx: 'ssconvert $1 $2' # Load .xlsx file

@@ -316,8 +316,14 @@ impl Rag {
self.data.chunk_overlap, self.data.chunk_overlap,
&separator, &separator,
); );
let metadata = metadata
.iter()
.map(|(k, v)| format!("{k}: {v}\n"))
.collect::<Vec<String>>()
.join("");
let split_options = SplitterChunkHeaderOptions::default().with_chunk_header(&format!( let split_options = SplitterChunkHeaderOptions::default().with_chunk_header(&format!(
"<document_metadata>\npath: {path}</document_metadata>\n\n" "<document_metadata>\npath: {path}\n{metadata}</document_metadata>\n\n"
)); ));
let document = RagDocument::new(contents); let document = RagDocument::new(contents);
let splitted_documents = splitter.split_documents(&[document], &split_options); let splitted_documents = splitter.split_documents(&[document], &split_options);
@@ -354,7 +360,7 @@ impl Rag {
self.data.add(next_file_id, files, document_ids, embeddings); self.data.add(next_file_id, files, document_ids, embeddings);
self.data.document_paths = document_paths; self.data.document_paths = document_paths;
progress(&spinner, "Building database".into()); progress(&spinner, "Building store".into());
self.hnsw = self.data.build_hnsw(); self.hnsw = self.data.build_hnsw();
self.bm25 = self.data.build_bm25(); self.bm25 = self.data.build_bm25();

Loading…
Cancel
Save