From 4ba4285b6b0166401a0023ad421a90ba44b73b29 Mon Sep 17 00:00:00 2001 From: Sunshine Date: Fri, 25 Dec 2020 15:55:52 -1000 Subject: [PATCH] add support for embedding video and audio files --- README.md | 2 + src/html.rs | 199 ++++++++++++++++++++++++++++++++++++++++++++-- src/opts.rs | 16 ++-- src/tests/opts.rs | 2 + 4 files changed, 206 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 675b816..3437096 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ The guide can be found [here](docs/containers.md) --------------------------------------------------- ## Options + - `-a`: Exclude audio sources - `-b`: Use custom base URL - `-c`: Exclude CSS - `-e`: Ignore network errors @@ -68,6 +69,7 @@ The guide can be found [here](docs/containers.md) - `-s`: Be quiet - `-t`: Adjust network request timeout - `-u`: Provide custom User-Agent + - `-v`: Exclude videos --------------------------------------------------- diff --git a/src/html.rs b/src/html.rs index 6c6db2d..9eae562 100644 --- a/src/html.rs +++ b/src/html.rs @@ -759,7 +759,7 @@ pub fn walk_and_embed_assets( } } "img" => { - // Find source attribute(s) + // Find src and data-src attribute(s) let img_attr_src_value: Option = get_node_attr(node, "src"); let img_attr_data_src_value: Option = get_node_attr(node, "data-src"); @@ -959,14 +959,101 @@ pub fn walk_and_embed_assets( } } "source" => { + let parent_node = get_parent_node(node); + let parent_node_name: &str = get_node_name(&parent_node).unwrap_or_default(); + if let Some(source_attr_src_value) = get_node_attr(node, "src") { - let src_full_url: String = resolve_url(&url, source_attr_src_value.clone()) - .unwrap_or_else(|_| source_attr_src_value.to_string()); - set_node_attr(node, "src", Some(src_full_url)); + if parent_node_name == "audio" { + if options.no_audio { + set_node_attr(node, "src", None); + } else { + let src_full_url: String = + resolve_url(&url, source_attr_src_value.clone()) + .unwrap_or_else(|_| source_attr_src_value.to_string()); + let src_url_fragment = get_url_fragment(src_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &src_full_url, + options, + depth + 1, + ) { + Ok((src_data, src_final_url, src_media_type)) => { + let src_data_url = data_to_data_url( + &src_media_type, + &src_data, + &src_final_url, + ); + let assembled_url: String = url_with_fragment( + src_data_url.as_str(), + src_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } + Err(_) => { + if is_http_url(src_full_url.clone()) { + // Keep remote reference if unable to retrieve the asset + let assembled_url: String = url_with_fragment( + src_full_url.as_str(), + src_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } else { + // Exclude non-remote URLs + set_node_attr(node, "src", None); + } + } + } + } + } else if parent_node_name == "video" { + if options.no_video { + set_node_attr(node, "src", None); + } else { + let src_full_url: String = + resolve_url(&url, source_attr_src_value.clone()) + .unwrap_or_else(|_| source_attr_src_value.to_string()); + let src_url_fragment = get_url_fragment(src_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &src_full_url, + options, + depth + 1, + ) { + Ok((src_data, src_final_url, src_media_type)) => { + let src_data_url = data_to_data_url( + &src_media_type, + &src_data, + &src_final_url, + ); + let assembled_url: String = url_with_fragment( + src_data_url.as_str(), + src_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } + Err(_) => { + if is_http_url(src_full_url.clone()) { + // Keep remote reference if unable to retrieve the asset + let assembled_url: String = url_with_fragment( + src_full_url.as_str(), + src_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } else { + // Exclude non-remote URLs + set_node_attr(node, "src", None); + } + } + } + } + } } if let Some(source_attr_srcset_value) = get_node_attr(node, "srcset") { - if get_node_name(&get_parent_node(&node)) == Some("picture") { + if parent_node_name == "picture" { if options.no_images { set_node_attr(node, "srcset", Some(str!(empty_image!()))); } else { @@ -994,13 +1081,16 @@ pub fn walk_and_embed_assets( set_node_attr(node, "srcset", Some(assembled_url)); } Err(_) => { - // Keep remote reference if unable to retrieve the asset if is_http_url(srcset_full_url.clone()) { + // Keep remote reference if unable to retrieve the asset let assembled_url: String = url_with_fragment( srcset_full_url.as_str(), srcset_url_fragment.as_str(), ); set_node_attr(node, "srcset", Some(assembled_url)); + } else { + // Exclude non-remote URLs + set_node_attr(node, "srcset", None); } } } @@ -1192,7 +1282,99 @@ pub fn walk_and_embed_assets( } } } + "audio" => { + if let Some(audio_attr_src_value) = get_node_attr(node, "src") { + if options.no_audio { + set_node_attr(node, "src", None); + } else { + let src_full_url: String = + resolve_url(&url, audio_attr_src_value.clone()) + .unwrap_or_else(|_| audio_attr_src_value.to_string()); + let src_url_fragment = get_url_fragment(src_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &src_full_url, + options, + depth + 1, + ) { + Ok((src_data, src_final_url, src_media_type)) => { + let src_data_url = data_to_data_url( + &src_media_type, + &src_data, + &src_final_url, + ); + let assembled_url: String = url_with_fragment( + src_data_url.as_str(), + src_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } + Err(_) => { + if is_http_url(src_full_url.clone()) { + // Keep remote reference if unable to retrieve the asset + let assembled_url: String = url_with_fragment( + src_full_url.as_str(), + src_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } else { + // Exclude non-remote URLs + set_node_attr(node, "src", None); + } + } + } + } + } + } "video" => { + if let Some(video_attr_src_value) = get_node_attr(node, "src") { + if options.no_video { + set_node_attr(node, "src", None); + } else { + let src_full_url: String = + resolve_url(&url, video_attr_src_value.clone()) + .unwrap_or_else(|_| video_attr_src_value.to_string()); + let src_url_fragment = get_url_fragment(src_full_url.clone()); + match retrieve_asset( + cache, + client, + &url, + &src_full_url, + options, + depth + 1, + ) { + Ok((src_data, src_final_url, src_media_type)) => { + let src_data_url = data_to_data_url( + &src_media_type, + &src_data, + &src_final_url, + ); + let assembled_url: String = url_with_fragment( + src_data_url.as_str(), + src_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } + Err(_) => { + if is_http_url(src_full_url.clone()) { + // Keep remote reference if unable to retrieve the asset + let assembled_url: String = url_with_fragment( + src_full_url.as_str(), + src_url_fragment.as_str(), + ); + set_node_attr(node, "src", Some(assembled_url)); + } else { + // Exclude non-remote URLs + set_node_attr(node, "src", None); + } + } + } + } + } + + // Embed poster images if let Some(video_attr_poster_value) = get_node_attr(node, "poster") { // Skip posters with empty source if !video_attr_poster_value.is_empty() { @@ -1228,13 +1410,16 @@ pub fn walk_and_embed_assets( set_node_attr(node, "poster", Some(assembled_url)); } Err(_) => { - // Keep remote reference if unable to retrieve the asset if is_http_url(video_poster_full_url.clone()) { + // Keep remote reference if unable to retrieve the asset let assembled_url: String = url_with_fragment( video_poster_full_url.as_str(), video_poster_url_fragment.as_str(), ); set_node_attr(node, "poster", Some(assembled_url)); + } else { + // Get rid of poster attribute if the URL is not remote + set_node_attr(node, "poster", None); } } } diff --git a/src/opts.rs b/src/opts.rs index bfe69d7..cb35539 100644 --- a/src/opts.rs +++ b/src/opts.rs @@ -2,6 +2,7 @@ use clap::{App, Arg}; #[derive(Default)] pub struct Options { + pub no_audio: bool, pub base_url: Option, pub no_css: bool, pub ignore_errors: bool, @@ -16,6 +17,7 @@ pub struct Options { pub silent: bool, pub timeout: u64, pub user_agent: String, + pub no_video: bool, pub target: String, } @@ -38,8 +40,8 @@ impl Options { .version(crate_version!()) .author(crate_authors!("\n")) .about(format!("{}\n{}", ASCII, crate_description!()).as_str()) - // .args_from_usage("-a, --no-audio 'Removes audio sources'") - .args_from_usage("-b, --base-url=[http://localhost/] 'Use custom base URL'") + .args_from_usage("-a, --no-audio 'Removes audio sources'") + .args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'") .args_from_usage("-c, --no-css 'Removes CSS'") .args_from_usage("-e, --ignore-errors 'Ignore network errors'") .args_from_usage("-f, --no-frames 'Removes frames and iframes'") @@ -49,11 +51,11 @@ impl Options { .args_from_usage("-j, --no-js 'Removes JavaScript'") .args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'") .args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'") - .args_from_usage("-o, --output=[document.html] 'Write output to '") + .args_from_usage("-o, --output=[document.html] 'Writes output to '") .args_from_usage("-s, --silent 'Suppresses verbosity'") - .args_from_usage("-t, --timeout=[60] 'Adjust network request timeout'") - .args_from_usage("-u, --user-agent=[Firefox] 'Set custom User-Agent string'") - // .args_from_usage("-v, --no-video 'Removes video sources'") + .args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'") + .args_from_usage("-u, --user-agent=[Firefox] 'Sets custom User-Agent string'") + .args_from_usage("-v, --no-video 'Removes video sources'") .arg( Arg::with_name("target") .required(true) @@ -69,6 +71,7 @@ impl Options { .value_of("target") .expect("please set target") .to_string(); + options.no_audio = app.is_present("no-audio"); if let Some(base_url) = app.value_of("base-url") { options.base_url = Some(str!(base_url)); } @@ -92,6 +95,7 @@ impl Options { .value_of("user-agent") .unwrap_or(DEFAULT_USER_AGENT) .to_string(); + options.no_video = app.is_present("no-video"); options } diff --git a/src/tests/opts.rs b/src/tests/opts.rs index 0b6a697..c00f4f3 100644 --- a/src/tests/opts.rs +++ b/src/tests/opts.rs @@ -14,6 +14,7 @@ mod passing { let options: Options = Options::default(); assert_eq!(options.target, str!()); + assert_eq!(options.no_audio, false); assert_eq!(options.no_css, false); assert_eq!(options.no_frames, false); assert_eq!(options.no_fonts, false); @@ -26,5 +27,6 @@ mod passing { assert_eq!(options.silent, false); assert_eq!(options.timeout, 0); assert_eq!(options.user_agent, ""); + assert_eq!(options.no_video, false); } }