mirror of
https://github.com/sigoden/aichat
synced 2024-11-08 13:10:28 +00:00
refactor: rag chunk contains path metadata (#647)
This commit is contained in:
parent
2bc9607b00
commit
9f19108a71
@@ -262,8 +262,10 @@ impl Rag {
|
|||||||
);
|
);
|
||||||
let documents = load(&path, &extension)
|
let documents = load(&path, &extension)
|
||||||
.with_context(|| format!("Failed to load file at '{path}'"))?;
|
.with_context(|| format!("Failed to load file at '{path}'"))?;
|
||||||
let documents =
|
let split_options = SplitterChunkHeaderOptions::default().with_chunk_header(&format!(
|
||||||
splitter.split_documents(&documents, &SplitterChunkHeaderOptions::default());
|
"<document_metadata>\npath: {path}\n</document_metadata>\n\n"
|
||||||
|
));
|
||||||
|
let documents = splitter.split_documents(&documents, &split_options);
|
||||||
rag_files.push(RagFile { path, documents });
|
rag_files.push(RagFile { path, documents });
|
||||||
progress(
|
progress(
|
||||||
&progress_tx,
|
&progress_tx,
|
||||||
|
@@ -106,7 +106,6 @@ impl RecursiveCharacterTextSplitter {
|
|||||||
let SplitterChunkHeaderOptions {
|
let SplitterChunkHeaderOptions {
|
||||||
chunk_header,
|
chunk_header,
|
||||||
chunk_overlap_header,
|
chunk_overlap_header,
|
||||||
append_chunk_overlap_header,
|
|
||||||
} = chunk_header_options;
|
} = chunk_header_options;
|
||||||
|
|
||||||
let mut documents = Vec::new();
|
let mut documents = Vec::new();
|
||||||
@@ -144,7 +143,7 @@ impl RecursiveCharacterTextSplitter {
|
|||||||
Ordering::Equal => {}
|
Ordering::Equal => {}
|
||||||
}
|
}
|
||||||
|
|
||||||
if *append_chunk_overlap_header {
|
if let Some(chunk_overlap_header) = chunk_overlap_header {
|
||||||
page_content += chunk_overlap_header;
|
page_content += chunk_overlap_header;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -288,16 +287,14 @@ impl RecursiveCharacterTextSplitter {
|
|||||||
|
|
||||||
pub struct SplitterChunkHeaderOptions {
|
pub struct SplitterChunkHeaderOptions {
|
||||||
pub chunk_header: String,
|
pub chunk_header: String,
|
||||||
pub chunk_overlap_header: String,
|
pub chunk_overlap_header: Option<String>,
|
||||||
pub append_chunk_overlap_header: bool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for SplitterChunkHeaderOptions {
|
impl Default for SplitterChunkHeaderOptions {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
chunk_header: "".into(),
|
chunk_header: "".into(),
|
||||||
chunk_overlap_header: "(cont'd) ".into(),
|
chunk_overlap_header: None,
|
||||||
append_chunk_overlap_header: false,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -313,14 +310,7 @@ impl SplitterChunkHeaderOptions {
|
|||||||
// Set the value of chunk_overlap_header
|
// Set the value of chunk_overlap_header
|
||||||
#[allow(unused)]
|
#[allow(unused)]
|
||||||
pub fn with_chunk_overlap_header(mut self, overlap_header: &str) -> Self {
|
pub fn with_chunk_overlap_header(mut self, overlap_header: &str) -> Self {
|
||||||
self.chunk_overlap_header = overlap_header.to_string();
|
self.chunk_overlap_header = Some(overlap_header.to_string());
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set the value of append_chunk_overlap_header
|
|
||||||
#[allow(unused)]
|
|
||||||
pub fn with_append_chunk_overlap_header(mut self, value: bool) -> Self {
|
|
||||||
self.append_chunk_overlap_header = value;
|
|
||||||
self
|
self
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -414,7 +404,7 @@ mod tests {
|
|||||||
let splitter = RecursiveCharacterTextSplitter::new(3, 0, &[" "]);
|
let splitter = RecursiveCharacterTextSplitter::new(3, 0, &[" "]);
|
||||||
let chunk_header_options = SplitterChunkHeaderOptions::default()
|
let chunk_header_options = SplitterChunkHeaderOptions::default()
|
||||||
.with_chunk_header("SOURCE NAME: testing\n-----\n")
|
.with_chunk_header("SOURCE NAME: testing\n-----\n")
|
||||||
.with_append_chunk_overlap_header(true);
|
.with_chunk_overlap_header("(cont'd) ");
|
||||||
let mut metadata1 = IndexMap::new();
|
let mut metadata1 = IndexMap::new();
|
||||||
metadata1.insert("source".into(), "1".into());
|
metadata1.insert("source".into(), "1".into());
|
||||||
let mut metadata2 = IndexMap::new();
|
let mut metadata2 = IndexMap::new();
|
||||||
|
Loading…
Reference in New Issue
Block a user