use html5ever::parse_document; use html5ever::rcdom::{Handle, NodeData, RcDom}; use html5ever::serialize::{serialize, SerializeOpts}; use html5ever::tendril::TendrilSink; use http::{is_valid_url, resolve_url, retrieve_asset}; use regex::Regex; use std::default::Default; use std::io; use utils::data_to_dataurl; lazy_static! { static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); } enum NodeMatch { Icon, Image, Source, StyleSheet, Anchor, Script, Form, IFrame, Other, } const TRANSPARENT_PIXEL: &str = "data:image/png;base64,\ iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII="; const JS_DOM_EVENT_ATTRS: [&str; 21] = [ // Input "onfocus", "onblur", "onselect", "onchange", "onsubmit", "onreset", "onkeydown", "onkeypress", "onkeyup", // Mouse "onmouseover", "onmouseout", "onmousedown", "onmouseup", "onmousemove", // Click "onclick", "ondblclick", // Load "onload", "onunload", "onabort", "onerror", "onresize", ]; fn get_parent_node_name(node: &Handle) -> String { let parent = node.parent.take().clone(); let parent_node = parent.and_then(|node| node.upgrade()).unwrap(); match &parent_node.data { NodeData::Document => {"".to_string()} NodeData::Doctype { .. } => {"".to_string()} NodeData::Text { .. } => {"".to_string()} NodeData::Comment { .. } => {"".to_string()} NodeData::Element { ref name, attrs: _, .. } => { name.local.as_ref().to_string() } NodeData::ProcessingInstruction { .. } => unreachable!() } } pub fn walk_and_embed_assets( url: &str, node: &Handle, opt_no_js: bool, opt_no_images: bool, opt_user_agent: &str, ) { match node.data { NodeData::Document => { // Dig deeper for child in node.children.borrow().iter() { walk_and_embed_assets( &url, child, opt_no_js, opt_no_images, opt_user_agent, ); } } NodeData::Doctype { .. } => {} NodeData::Text { .. } => {} NodeData::Comment { .. } => { // Note: in case of opt_no_js being set to true, there's no need to worry about // getting rid of comments that may contain scripts, e.g.