use html5ever::interface::QualName; use html5ever::parse_document; use html5ever::rcdom::{Handle, NodeData, RcDom}; use html5ever::serialize::{serialize, SerializeOpts}; use html5ever::tendril::{format_tendril, TendrilSink}; use html5ever::tree_builder::{Attribute, TreeSink}; use html5ever::{local_name, namespace_url, ns}; use http::{is_valid_url, resolve_url, retrieve_asset}; use regex::Regex; use std::default::Default; use utils::data_to_dataurl; lazy_static! { static ref EMPTY_STRING: String = String::new(); static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); static ref ICON_VALUES: Regex = Regex::new(r"^icon|shortcut icon|mask-icon|apple-touch-icon|fluid-icon$").unwrap(); } const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\ iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="; const JS_DOM_EVENT_ATTRS: [&str; 21] = [ // Input "onfocus", "onblur", "onselect", "onchange", "onsubmit", "onreset", "onkeydown", "onkeypress", "onkeyup", // Mouse "onmouseover", "onmouseout", "onmousedown", "onmouseup", "onmousemove", // Click "onclick", "ondblclick", // Load "onload", "onunload", "onabort", "onerror", "onresize", ]; fn get_parent_node(node: &Handle) -> Handle { let parent = node.parent.take().clone(); parent.and_then(|node| node.upgrade()).unwrap() } fn get_node_name(node: &Handle) -> String { match &node.data { NodeData::Element { ref name, .. } => name.local.as_ref().to_string(), _ => EMPTY_STRING.clone(), } } pub fn walk_and_embed_assets( url: &str, node: &Handle, opt_no_css: bool, opt_no_js: bool, opt_no_images: bool, opt_user_agent: &str, opt_silent: bool, opt_insecure: bool, opt_no_frames: bool, ) { match node.data { NodeData::Document => { // Dig deeper for child in node.children.borrow().iter() { walk_and_embed_assets( &url, child, opt_no_css, opt_no_js, opt_no_images, opt_user_agent, opt_silent, opt_insecure, opt_no_frames, ); } } NodeData::Element { ref name, ref attrs, .. } => { let attrs_mut = &mut attrs.borrow_mut(); match name.local.as_ref() { "link" => { let mut link_type: &str = ""; for attr in attrs_mut.iter_mut() { if &attr.name.local == "rel" { if is_icon(&attr.value.to_string()) { link_type = "icon"; break; } else if attr.value.to_string() == "stylesheet" { link_type = "stylesheet"; break; } } } if link_type == "icon" { for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { if opt_no_images { attr.value.clear(); } else { let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let favicon_datauri = retrieve_asset( &href_full_url, true, "", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(favicon_datauri.as_str()); } } } } else if link_type == "stylesheet" { for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { if opt_no_css { attr.value.clear(); } else { let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let css_datauri = retrieve_asset( &href_full_url, true, "text/css", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(css_datauri.as_str()); } } } } else { for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(&href_full_url.as_str()); } } } } "img" => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "src" { let value = attr.value.to_string(); // Ignore images with empty source if value == EMPTY_STRING.clone() { continue; } if opt_no_images { attr.value.clear(); attr.value.push_slice(TRANSPARENT_PIXEL); } else { let src_full_url: String = resolve_url(&url, &value).unwrap_or(EMPTY_STRING.clone()); let img_datauri = retrieve_asset( &src_full_url, true, "", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(img_datauri.as_str()); } } } } "source" => { for attr in attrs_mut.iter_mut() { let attr_name: &str = &attr.name.local; if attr_name == "src" { let src_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(attr.value.to_string()); attr.value.clear(); attr.value.push_slice(src_full_url.as_str()); } else if attr_name == "srcset" { if get_node_name(&get_parent_node(&node)) == "picture" { if opt_no_images { attr.value.clear(); attr.value.push_slice(TRANSPARENT_PIXEL); } else { let srcset_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let source_datauri = retrieve_asset( &srcset_full_url, true, "", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(source_datauri.as_str()); } } } } } "a" => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { // Don't touch email links or hrefs which begin with a hash sign if attr.value.starts_with('#') || has_protocol(&attr.value) { continue; } let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(href_full_url.as_str()); } } } "script" => { if opt_no_js { // Empty src and inner content of SCRIPT tags for attr in attrs_mut.iter_mut() { if &attr.name.local == "src" { attr.value.clear(); } } node.children.borrow_mut().clear(); } else { for attr in attrs_mut.iter_mut() { if &attr.name.local == "src" { let src_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); let js_datauri = retrieve_asset( &src_full_url, true, "application/javascript", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(js_datauri.as_str()); } } } } "style" => { if opt_no_css { // Empty inner content of STYLE tags node.children.borrow_mut().clear(); } } "form" => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "action" { // Don't modify action that's already a full URL if is_valid_url(&attr.value) { continue; } let href_full_url: String = resolve_url(&url, &attr.value.to_string()) .unwrap_or(EMPTY_STRING.clone()); attr.value.clear(); attr.value.push_slice(href_full_url.as_str()); } } } "iframe" => { if opt_no_frames { // Empty the src attribute for attr in attrs_mut.iter_mut() { if &attr.name.local == "src" { attr.value.clear(); } } } else { for attr in attrs_mut.iter_mut() { if &attr.name.local == "src" { let iframe_src = attr.value.to_string(); // Ignore iframes with empty source (they cause infinite loops) if iframe_src == EMPTY_STRING.clone() { continue; } let src_full_url: String = resolve_url(&url, &iframe_src).unwrap_or(EMPTY_STRING.clone()); let iframe_data = retrieve_asset( &src_full_url, false, "text/html", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or(EMPTY_STRING.clone()); let dom = html_to_dom(&iframe_data); walk_and_embed_assets( &src_full_url, &dom.document, opt_no_css, opt_no_js, opt_no_images, opt_user_agent, opt_silent, opt_insecure, opt_no_frames, ); let mut buf: Vec = Vec::new(); serialize(&mut buf, &dom.document, SerializeOpts::default()) .unwrap(); let iframe_datauri = data_to_dataurl("text/html", &buf); attr.value.clear(); attr.value.push_slice(iframe_datauri.as_str()); } } } } "video" => { for attr in attrs_mut.iter_mut() { if &attr.name.local == "poster" { let video_poster = attr.value.to_string(); // Ignore posters with empty source if video_poster == EMPTY_STRING.clone() { continue; } if opt_no_images { attr.value.clear(); } else { let poster_full_url: String = resolve_url(&url, &video_poster) .unwrap_or(EMPTY_STRING.clone()); let img_datauri = retrieve_asset( &poster_full_url, true, "", opt_user_agent, opt_silent, opt_insecure, ) .unwrap_or(poster_full_url); attr.value.clear(); attr.value.push_slice(img_datauri.as_str()); } } } } _ => {} } if opt_no_css { // Get rid of style attributes for attr in attrs_mut.iter_mut() { if attr.name.local.to_lowercase() == "style" { attr.value.clear(); } } } if opt_no_js { // Get rid of JS event attributes for attr in attrs_mut.iter_mut() { if JS_DOM_EVENT_ATTRS.contains(&attr.name.local.to_lowercase().as_str()) { attr.value.clear(); } } } // Dig deeper for child in node.children.borrow().iter() { walk_and_embed_assets( &url, child, opt_no_css, opt_no_js, opt_no_images, opt_user_agent, opt_silent, opt_insecure, opt_no_frames, ); } } _ => { // Note: in case of opt_no_js being set to true, there's no need to worry about // getting rid of comments that may contain scripts, e.g.