refactor: fetch image as data url (#694)

pull/695/head
sigoden authored 3 months ago, committed via GitHub
parent 0afe5fa24b
commit 48219d2808

@@ -379,13 +379,14 @@ async fn load_paths(
         }
     }
     for file_url in remote_urls {
-        if is_image(&file_url) {
-            medias.push(file_url)
+        let (contents, extension) = fetch(&loaders, &file_url, true)
+            .await
+            .with_context(|| format!("Failed to load url '{file_url}'"))?;
+        if extension == MEDIA_URL_EXTENSION {
+            data_urls.insert(sha256(&contents), file_url);
+            medias.push(contents)
         } else {
-            let (text, _) = fetch(&loaders, &file_url)
-                .await
-                .with_context(|| format!("Failed to load url '{file_url}'"))?;
-            files.push((file_url, text));
+            files.push((file_url, contents));
         }
     }
     Ok((files, medias, data_urls))
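Note: the loop above now sends every remote URL through `fetch` and branches on the returned extension sentinel instead of guessing from the URL. The sketch below mirrors that bookkeeping with plain std types; `route_fetched` is a hypothetical name used only for illustration, and `sha256` is a stand-in built on the `sha2` crate (an assumption, the repo's own helper may be implemented differently).

use std::collections::HashMap;

use sha2::{Digest, Sha256};

const MEDIA_URL_EXTENSION: &str = "media_url";

// Hex-encoded SHA-256 of the fetched contents, used as the data_urls key.
fn sha256(data: &str) -> String {
    Sha256::digest(data.as_bytes())
        .iter()
        .map(|b| format!("{b:02x}"))
        .collect()
}

// Route one fetched remote URL into files / medias / data_urls,
// following the branch added in this commit.
fn route_fetched(
    file_url: String,
    contents: String,
    extension: &str,
    files: &mut Vec<(String, String)>,
    medias: &mut Vec<String>,
    data_urls: &mut HashMap<String, String>,
) {
    if extension == MEDIA_URL_EXTENSION {
        // Remember which original URL produced this data URL, keyed by content hash.
        data_urls.insert(sha256(&contents), file_url);
        medias.push(contents);
    } else {
        files.push((file_url, contents));
    }
}
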
@@ -416,22 +417,19 @@ fn resolve_local_path(path: &str) -> Option<String> {
 }
 
 fn is_image(path: &str) -> bool {
-    path_extension(path)
+    get_patch_extension(path)
         .map(|v| IMAGE_EXTS.contains(&v.as_str()))
         .unwrap_or_default()
 }
 
-fn read_media_to_data_url<P: AsRef<Path>>(image_path: P) -> Result<String> {
-    let image_path = image_path.as_ref();
-    let mime_type = match image_path.extension().and_then(|v| v.to_str()) {
-        Some(extension) => match extension {
-            "png" => "image/png",
-            "jpg" | "jpeg" => "image/jpeg",
-            "webp" => "image/webp",
-            "gif" => "image/gif",
-            _ => bail!("Unsupported media type"),
-        },
-        None => bail!("Unknown media type"),
+fn read_media_to_data_url(image_path: &str) -> Result<String> {
+    let extension = get_patch_extension(image_path).unwrap_or_default();
+    let mime_type = match extension.as_str() {
+        "png" => "image/png",
+        "jpg" | "jpeg" => "image/jpeg",
+        "webp" => "image/webp",
+        "gif" => "image/gif",
+        _ => bail!("Unexpected media type"),
     };
     let mut file = File::open(image_path)?;
     let mut buffer = Vec::new();
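The rewritten `read_media_to_data_url` keys the MIME type off the lowercased file extension; the tail of the hunk (cut off above) goes on to read the file and embed it as a base64 data URL. A minimal standalone sketch of that pattern, assuming the `base64` crate's 0.21+ `Engine` API rather than whatever encoding helper the repo actually uses:

use std::{fs, path::Path};

use anyhow::{bail, Result};
use base64::{engine::general_purpose::STANDARD, Engine as _};

fn read_media_to_data_url_sketch(image_path: &str) -> Result<String> {
    // Lowercased file extension drives the MIME type, as in the diff above.
    let extension = Path::new(image_path)
        .extension()
        .map(|v| v.to_string_lossy().to_lowercase())
        .unwrap_or_default();
    let mime_type = match extension.as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "webp" => "image/webp",
        "gif" => "image/gif",
        _ => bail!("Unexpected media type"),
    };
    // Read the whole file and embed it as a base64 data URL.
    let bytes = fs::read(image_path)?;
    Ok(format!("data:{mime_type};base64,{}", STANDARD.encode(bytes)))
}
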

@@ -60,7 +60,7 @@ pub async fn load_file(
     loaders: &HashMap<String, String>,
     path: &str,
 ) -> Result<(String, RagMetadata)> {
-    let extension = path_extension(path).unwrap_or_else(|| DEFAULT_EXTENSION.into());
+    let extension = get_patch_extension(path).unwrap_or_else(|| DEFAULT_EXTENSION.into());
     match loaders.get(&extension) {
         Some(loader_command) => load_with_command(path, &extension, loader_command),
         None => load_plain(path, &extension).await,
@@ -71,7 +71,7 @@ pub async fn load_url(
     loaders: &HashMap<String, String>,
     path: &str,
 ) -> Result<(String, RagMetadata)> {
-    let (contents, extension) = fetch(loaders, path).await?;
+    let (contents, extension) = fetch(loaders, path, false).await?;
     let mut metadata: RagMetadata = Default::default();
     metadata.insert(PATH_METADATA.into(), path.into());
     metadata.insert(EXTENSION_METADATA.into(), extension);

@@ -42,7 +42,7 @@ pub async fn expand_glob_paths<T: AsRef<str>>(paths: &[T]) -> Result<Vec<String>
     Ok(new_paths)
 }
 
-pub fn path_extension(path: &str) -> Option<String> {
+pub fn get_patch_extension(path: &str) -> Option<String> {
     Path::new(&path)
         .extension()
         .map(|v| v.to_string_lossy().to_lowercase())

@@ -8,6 +8,7 @@ use tokio::io::AsyncWriteExt;
 
 pub const URL_LOADER: &str = "url";
 pub const RECURSIVE_URL_LOADER: &str = "recursive_url";
+pub const MEDIA_URL_EXTENSION: &str = "media_url";
 pub const DEFAULT_EXTENSION: &str = "txt";
 
 lazy_static! {
@@ -19,7 +20,11 @@ lazy_static! {
     };
 }
 
-pub async fn fetch(loaders: &HashMap<String, String>, path: &str) -> Result<(String, String)> {
+pub async fn fetch(
+    loaders: &HashMap<String, String>,
+    path: &str,
+    allow_media: bool,
+) -> Result<(String, String)> {
     if let Some(loader_command) = loaders.get(URL_LOADER) {
         let contents = run_loader_command(path, URL_LOADER, loader_command)?;
         return Ok((contents, DEFAULT_EXTENSION.into()));
@@ -29,6 +34,9 @@ pub async fn fetch(loaders: &HashMap<String, String>, path: &str) -> Result<(Str
         Err(ref err) => bail!("{err}"),
     };
     let mut res = client.get(path).send().await?;
+    if !res.status().is_success() {
+        bail!("Invalid status: {}", res.status());
+    }
     let content_type = res
         .headers()
         .get(CONTENT_TYPE)
@@ -37,45 +45,71 @@ pub async fn fetch(loaders: &HashMap<String, String>, path: &str) -> Result<(Str
             Some((mime, _)) => mime.trim(),
             None => v,
         })
-        .unwrap_or_default();
-    let extension = match content_type {
-        "application/pdf" => "pdf",
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => "docx",
-        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => "xlsx",
-        "application/vnd.openxmlformats-officedocument.presentationml.presentation" => "pptx",
-        "application/vnd.oasis.opendocument.text" => "odt",
-        "application/vnd.oasis.opendocument.spreadsheet" => "ods",
-        "application/vnd.oasis.opendocument.presentation" => "odp",
-        "application/rtf" => "rtf",
-        "text/html" => "html",
-        _ => path
-            .rsplit_once('/')
-            .and_then(|(_, pair)| pair.rsplit_once('.').map(|(_, ext)| ext))
-            .unwrap_or(DEFAULT_EXTENSION),
-    };
-    let extension = extension.to_lowercase();
-    let result = match loaders.get(&extension) {
-        Some(loader_command) => {
-            let save_path = temp_file("-download-", &format!(".{extension}"))
-                .display()
-                .to_string();
-            let mut save_file = tokio::fs::File::create(&save_path).await?;
-            let mut size = 0;
-            while let Some(chunk) = res.chunk().await? {
-                size += chunk.len();
-                save_file.write_all(&chunk).await?;
-            }
-            let contents = if size == 0 {
-                println!("{}", warning_text(&format!("No content at '{path}'")));
-                String::new()
-            } else {
-                run_loader_command(&save_path, &extension, loader_command)?
-            };
-            (contents, DEFAULT_EXTENSION.into())
-        }
-        None => {
-            let contents = res.text().await?;
-            (contents, extension)
-        }
+        .map(|v| v.to_string())
+        .unwrap_or_else(|| {
+            format!(
+                "_/{}",
+                get_patch_extension(path).unwrap_or_else(|| DEFAULT_EXTENSION.into())
+            )
+        });
+    let mut is_media = false;
+    let extension = match content_type.as_str() {
+        "application/pdf" => "pdf".into(),
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => "docx".into(),
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" => "xlsx".into(),
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation" => {
+            "pptx".into()
+        }
+        "application/vnd.oasis.opendocument.text" => "odt".into(),
+        "application/vnd.oasis.opendocument.spreadsheet" => "ods".into(),
+        "application/vnd.oasis.opendocument.presentation" => "odp".into(),
+        "application/rtf" => "rtf".into(),
+        "text/javascript" => "js".into(),
+        "text/html" => "html".into(),
+        _ => content_type
+            .rsplit_once('/')
+            .map(|(first, last)| {
+                if ["image", "video", "audio"].contains(&first) {
+                    is_media = true;
+                    MEDIA_URL_EXTENSION.into()
+                } else {
+                    last.to_lowercase()
+                }
+            })
+            .unwrap_or_else(|| DEFAULT_EXTENSION.into()),
+    };
+    let result = if is_media {
+        if !allow_media {
+            bail!("Unexpected media type")
+        }
+        let image_bytes = res.bytes().await?;
+        let image_base64 = base64_encode(&image_bytes);
+        let contents = format!("data:{};base64,{}", content_type, image_base64);
+        (contents, extension)
+    } else {
+        match loaders.get(&extension) {
+            Some(loader_command) => {
+                let save_path = temp_file("-download-", &format!(".{extension}"))
+                    .display()
+                    .to_string();
+                let mut save_file = tokio::fs::File::create(&save_path).await?;
+                let mut size = 0;
+                while let Some(chunk) = res.chunk().await? {
+                    size += chunk.len();
+                    save_file.write_all(&chunk).await?;
+                }
+                let contents = if size == 0 {
+                    println!("{}", warning_text(&format!("No content at '{path}'")));
+                    String::new()
+                } else {
+                    run_loader_command(&save_path, &extension, loader_command)?
+                };
+                (contents, DEFAULT_EXTENSION.into())
+            }
+            None => {
+                let contents = res.text().await?;
+                (contents, extension)
+            }
+        }
     };
     Ok(result)
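The bulk of the change is in the hunk above: unknown content types now fall back to a synthetic `_/{extension}` MIME string, `image/*`, `video/*`, and `audio/*` responses are flagged as media, and media bodies are returned as base64 data URLs instead of being handed to a loader command. A condensed standalone sketch of that classification and data-URL step follows; it assumes the `base64` crate, while `base64_encode` in the diff is the repo's own helper, and `classify_content_type` / `to_data_url` are hypothetical names used only here.

use base64::{engine::general_purpose::STANDARD, Engine as _};

const MEDIA_URL_EXTENSION: &str = "media_url";
const DEFAULT_EXTENSION: &str = "txt";

// Map a MIME type to the extension used to pick a loader, flagging media.
fn classify_content_type(content_type: &str) -> (String, bool) {
    match content_type {
        "application/pdf" => ("pdf".into(), false),
        "text/html" => ("html".into(), false),
        // ...the other document types from the diff above are elided here...
        _ => match content_type.rsplit_once('/') {
            Some((first, last)) => {
                if ["image", "video", "audio"].contains(&first) {
                    (MEDIA_URL_EXTENSION.into(), true)
                } else {
                    (last.to_lowercase(), false)
                }
            }
            None => (DEFAULT_EXTENSION.into(), false),
        },
    }
}

// Build the kind of data URL that `fetch` now returns for media responses.
fn to_data_url(content_type: &str, body: &[u8]) -> String {
    format!("data:{content_type};base64,{}", STANDARD.encode(body))
}

fn main() {
    let (ext, is_media) = classify_content_type("image/png");
    assert_eq!(ext, MEDIA_URL_EXTENSION);
    assert!(is_media);
    println!("{}", to_data_url("image/png", b"\x89PNG..."));
}
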
