refactor: rag chunk contains path metadata (#647)

This commit is contained in:
sigoden 2024-06-25 12:55:38 +08:00 committed by GitHub
parent 2bc9607b00
commit 9f19108a71
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 9 additions and 17 deletions

View File

@@ -262,8 +262,10 @@ impl Rag {
             );
             let documents = load(&path, &extension)
                 .with_context(|| format!("Failed to load file at '{path}'"))?;
-            let documents =
-                splitter.split_documents(&documents, &SplitterChunkHeaderOptions::default());
+            let split_options = SplitterChunkHeaderOptions::default().with_chunk_header(&format!(
+                "<document_metadata>\npath: {path}\n</document_metadata>\n\n"
+            ));
+            let documents = splitter.split_documents(&documents, &split_options);
             rag_files.push(RagFile { path, documents });
             progress(
                 &progress_tx,

View File

@@ -106,7 +106,6 @@ impl RecursiveCharacterTextSplitter {
         let SplitterChunkHeaderOptions {
             chunk_header,
             chunk_overlap_header,
-            append_chunk_overlap_header,
         } = chunk_header_options;
         let mut documents = Vec::new();
@@ -144,7 +143,7 @@ impl RecursiveCharacterTextSplitter {
                     Ordering::Equal => {}
                 }
-                if *append_chunk_overlap_header {
+                if let Some(chunk_overlap_header) = chunk_overlap_header {
                     page_content += chunk_overlap_header;
                 }
             }
@@ -288,16 +287,14 @@ impl RecursiveCharacterTextSplitter {
 pub struct SplitterChunkHeaderOptions {
     pub chunk_header: String,
-    pub chunk_overlap_header: String,
-    pub append_chunk_overlap_header: bool,
+    pub chunk_overlap_header: Option<String>,
 }
 impl Default for SplitterChunkHeaderOptions {
     fn default() -> Self {
         Self {
             chunk_header: "".into(),
-            chunk_overlap_header: "(cont'd) ".into(),
-            append_chunk_overlap_header: false,
+            chunk_overlap_header: None,
         }
     }
 }
@@ -313,14 +310,7 @@ impl SplitterChunkHeaderOptions {
     // Set the value of chunk_overlap_header
     #[allow(unused)]
     pub fn with_chunk_overlap_header(mut self, overlap_header: &str) -> Self {
-        self.chunk_overlap_header = overlap_header.to_string();
-        self
-    }
-    // Set the value of append_chunk_overlap_header
-    #[allow(unused)]
-    pub fn with_append_chunk_overlap_header(mut self, value: bool) -> Self {
-        self.append_chunk_overlap_header = value;
+        self.chunk_overlap_header = Some(overlap_header.to_string());
         self
     }
 }
@@ -414,7 +404,7 @@ mod tests {
         let splitter = RecursiveCharacterTextSplitter::new(3, 0, &[" "]);
         let chunk_header_options = SplitterChunkHeaderOptions::default()
             .with_chunk_header("SOURCE NAME: testing\n-----\n")
-            .with_append_chunk_overlap_header(true);
+            .with_chunk_overlap_header("(cont'd) ");
         let mut metadata1 = IndexMap::new();
         metadata1.insert("source".into(), "1".into());
         let mut metadata2 = IndexMap::new();