diff --git a/.appveyor.yml b/.appveyor.yml index 12c88c3..5a17430 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -52,11 +52,11 @@ environment: # Nightly 64-bit MSVC - channel: nightly target: x86_64-pc-windows-msvc - #cargoflags: --features "unstable" + cargoflags: --features "unstable" # Nightly 32-bit MSVC - channel: nightly target: i686-pc-windows-msvc - #cargoflags: --features "unstable" + cargoflags: --features "unstable" ### GNU Toolchains ### @@ -80,12 +80,12 @@ environment: - channel: nightly target: x86_64-pc-windows-gnu MINGW_PATH: 'C:\mingw-w64\x86_64-6.3.0-posix-seh-rt_v5-rev1\mingw64\bin' - #cargoflags: --features "unstable" + cargoflags: --features "unstable" # Nightly 32-bit GNU - channel: nightly target: i686-pc-windows-gnu MINGW_PATH: 'C:\MinGW\bin' - #cargoflags: --features "unstable" + cargoflags: --features "unstable" ### Allowed failures ### @@ -124,7 +124,8 @@ install: build: false # Uses 'cargo test' to run tests and build. Alternatively, the project may call compiled programs -#directly or perform other testing commands. Rust will automatically be placed in the PATH +# directly or perform other testing commands. Rust will automatically be placed in the PATH # environment variable. test_script: - - cargo test --verbose %cargoflags% + - cargo build --all --locked --verbose %cargoflags% + - cargo test --all --locked --verbose %cargoflags% diff --git a/Cargo.lock b/Cargo.lock index ec34cb7..92686ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,6 +26,18 @@ name = "anyhow" version = "1.0.26" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "assert_cmd" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "doc-comment 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)", + "escargot 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)", + "predicates 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "predicates-core 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "predicates-tree 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "async-compression" version = "0.2.0" @@ -134,6 +146,16 @@ dependencies = [ "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "difference" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "doc-comment" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "dtoa" version = "0.4.4" @@ -147,6 +169,17 @@ dependencies = [ "cfg-if 0.1.10 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "escargot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "lazy_static 1.4.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "flate2" version = "1.0.13" @@ -512,6 +545,7 @@ dependencies = [ name = "monolith" version = "2.1.2" dependencies = [ + "assert_cmd 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)", "base64 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)", "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", "html5ever 0.24.1 (registry+https://github.com/rust-lang/crates.io-index)", @@ -683,6 +717,29 @@ name = "precomputed-hash" version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "predicates" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "difference 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "predicates-core 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "predicates-core" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "predicates-tree" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "predicates-core 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", + "treeline 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "proc-macro2" version = "1.0.6" @@ -941,6 +998,9 @@ dependencies = [ name = "serde" version = "1.0.103" source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "serde_derive 1.0.103 (registry+https://github.com/rust-lang/crates.io-index)", +] [[package]] name = "serde_derive" @@ -1131,6 +1191,11 @@ name = "tower-service" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "treeline" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "try-lock" version = "0.2.2" @@ -1372,6 +1437,7 @@ dependencies = [ "checksum aho-corasick 0.7.6 (registry+https://github.com/rust-lang/crates.io-index)" = "58fb5e95d83b38284460a5fda7d6470aa0b8844d283a0b614b8535e880800d2d" "checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" "checksum anyhow 1.0.26 (registry+https://github.com/rust-lang/crates.io-index)" = "7825f6833612eb2414095684fcf6c635becf3ce97fe48cf6421321e93bfbd53c" +"checksum assert_cmd 0.12.0 (registry+https://github.com/rust-lang/crates.io-index)" = "6283bac8dd7226470d491bc4737816fea4ca1fba7a2847f2e9097fd6bfb4624c" "checksum async-compression 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "2c5c52622726d68ec35fec88edfb4ccb862d4f3b3bfa4af2f45142e69ef9b220" "checksum atty 0.2.13 (registry+https://github.com/rust-lang/crates.io-index)" = "1803c647a3ec87095e7ae7acfca019e98de5ec9a7d01343f611cf3152ed71a90" "checksum autocfg 0.1.7 (registry+https://github.com/rust-lang/crates.io-index)" = "1d49d90015b3c36167a20fe2810c5cd875ad504b39cff3d4eae7977e6b7c1cb2" @@ -1387,8 +1453,11 @@ dependencies = [ "checksum core-foundation 0.6.4 (registry+https://github.com/rust-lang/crates.io-index)" = "25b9e03f145fd4f2bf705e07b900cd41fc636598fe5dc452fd0db1441c3f496d" "checksum core-foundation-sys 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e7ca8a5221364ef15ce201e8ed2f609fc312682a8f4e0e3d4aa5879764e0fa3b" "checksum crc32fast 1.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ba125de2af0df55319f41944744ad91c71113bf74a4646efff39afe1f6842db1" +"checksum difference 2.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "524cbf6897b527295dff137cec09ecf3a05f4fddffd7dfcd1585403449e74198" +"checksum doc-comment 0.3.1 (registry+https://github.com/rust-lang/crates.io-index)" = "923dea538cea0aa3025e8685b20d6ee21ef99c4f77e954a30febbaac5ec73a97" "checksum dtoa 0.4.4 (registry+https://github.com/rust-lang/crates.io-index)" = "ea57b42383d091c85abcc2706240b94ab2a8fa1fc81c10ff23c4de06e2a90b5e" "checksum encoding_rs 0.8.20 (registry+https://github.com/rust-lang/crates.io-index)" = "87240518927716f79692c2ed85bfe6e98196d18c6401ec75355760233a7e12e9" +"checksum escargot 0.5.0 (registry+https://github.com/rust-lang/crates.io-index)" = "74cf96bec282dcdb07099f7e31d9fed323bca9435a09aba7b6d99b7617bca96d" "checksum flate2 1.0.13 (registry+https://github.com/rust-lang/crates.io-index)" = "6bd6d6f4752952feb71363cffc9ebac9411b75b87c6ab6058c40c8900cf43c0f" "checksum fnv 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" "checksum foreign-types 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" @@ -1451,6 +1520,9 @@ dependencies = [ "checksum pkg-config 0.3.17 (registry+https://github.com/rust-lang/crates.io-index)" = "05da548ad6865900e60eaba7f589cc0783590a92e940c26953ff81ddbab2d677" "checksum ppv-lite86 0.2.6 (registry+https://github.com/rust-lang/crates.io-index)" = "74490b50b9fbe561ac330df47c08f3f33073d2d00c150f719147d7c54522fa1b" "checksum precomputed-hash 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" +"checksum predicates 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "a9bfe52247e5cc9b2f943682a85a5549fb9662245caf094504e69a2f03fe64d4" +"checksum predicates-core 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "06075c3a3e92559ff8929e7a280684489ea27fe44805174c3ebd9328dcb37178" +"checksum predicates-tree 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "8e63c4859013b38a76eca2414c64911fba30def9e3202ac461a2d22831220124" "checksum proc-macro2 1.0.6 (registry+https://github.com/rust-lang/crates.io-index)" = "9c9e470a8dc4aeae2dee2f335e8f533e2d4b347e1434e5671afc49b054592f27" "checksum quote 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "053a8c8bcc71fcce321828dc897a98ab9760bef03a4fc36693c231e5b3216cfe" "checksum rand 0.6.5 (registry+https://github.com/rust-lang/crates.io-index)" = "6d71dacdc3c88c1fde3885a3be3fbab9f35724e6ce99467f7d9c5026132184ca" @@ -1499,6 +1571,7 @@ dependencies = [ "checksum tokio-tls 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "7bde02a3a5291395f59b06ec6945a3077602fac2b07eeeaf0dee2122f3619828" "checksum tokio-util 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "571da51182ec208780505a32528fc5512a8fe1443ab960b3f2f3ef093cd16930" "checksum tower-service 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e987b6bf443f4b5b3b6f38704195592cca41c5bb7aedd3c3693c7081f8289860" +"checksum treeline 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a7f741b240f1a48843f9b8e0444fb55fb2a4ff67293b50a9179dfd5ea67f8d41" "checksum try-lock 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e604eb7b43c06650e854be16a2a03155743d3752dd1c943f6829e26b7a36e382" "checksum unicase 2.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6" "checksum unicode-bidi 0.3.4 (registry+https://github.com/rust-lang/crates.io-index)" = "49f2bd0c6468a8230e1db229cff8029217cf623c767ea5d60bfbd42729ea54d5" diff --git a/Cargo.toml b/Cargo.toml index 4cf1fd1..5c6aa20 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,3 +23,6 @@ url = "2.1.1" version = "0.10.*" default-features = false features = ["default-tls", "blocking", "gzip"] + +[dev-dependencies] +assert_cmd = "0.12.0" diff --git a/Makefile b/Makefile index 5964e1a..3117e09 100644 --- a/Makefile +++ b/Makefile @@ -1,16 +1,21 @@ -.PHONY: all build install run test lint +#!/usr/bin/make -f -all: test build +all: test +.PHONY: all build: @cargo build --locked +.PHONY: build install: @cargo install --force --locked --path . +.PHONY: install -test: +test: build @cargo test --locked @cargo fmt --all -- --check +.PHONY: test lint: @cargo fmt --all -- +.PHONY: lint diff --git a/src/args.rs b/src/args.rs index ac0c618..ca72efc 100644 --- a/src/args.rs +++ b/src/args.rs @@ -21,7 +21,7 @@ const DEFAULT_USER_AGENT: &str = impl AppArgs { pub fn get() -> AppArgs { - let app = App::new("monolith") + let app = App::new(env!("CARGO_PKG_NAME")) .version(crate_version!()) .author(crate_authors!("\n")) .about(crate_description!()) diff --git a/src/html.rs b/src/html.rs index 7f9c91f..e6ded28 100644 --- a/src/html.rs +++ b/src/html.rs @@ -1,7 +1,7 @@ use crate::http::retrieve_asset; use crate::js::attr_is_event_handler; use crate::utils::{ - data_to_dataurl, is_valid_url, resolve_css_imports, resolve_url, url_has_protocol, + data_to_data_url, is_http_url, resolve_css_imports, resolve_url, url_has_protocol, }; use html5ever::interface::QualName; use html5ever::parse_document; @@ -130,7 +130,7 @@ pub fn walk_and_embed_assets( } else { let href_full_url = resolve_url(&url, attr.value.as_ref()) .unwrap_or_default(); - let (favicon_dataurl, _) = retrieve_asset( + let (favicon_data_url, _) = retrieve_asset( cache, client, &href_full_url, @@ -140,7 +140,7 @@ pub fn walk_and_embed_assets( ) .unwrap_or_default(); attr.value.clear(); - attr.value.push_slice(favicon_dataurl.as_str()); + attr.value.push_slice(favicon_data_url.as_str()); } } } @@ -229,14 +229,14 @@ pub fn walk_and_embed_assets( name: QualName::new(None, ns!(), local_name!("src")), value: Tendril::from_slice(TRANSPARENT_PIXEL), }); - } else if let Some((dataurl, _)) = found_datasrc + } else if let Some((data_url, _)) = found_datasrc .iter() - .chain(&found_src) // Give dataurl priority + .chain(&found_src) // Give data_url priority .map(|attr| attr.value.trim()) .filter(|src| !src.is_empty()) // Ignore empty srcs .next() .and_then(|src| resolve_url(&url, src).ok()) // Make absolute - .and_then(|abs_src| // Download and convert to dataurl + .and_then(|abs_src| // Download and convert to data_url retrieve_asset( cache, client, @@ -246,10 +246,10 @@ pub fn walk_and_embed_assets( opt_silent, ).ok()) { - // Add the new dataurl src attribute + // Add the new data_url src attribute attrs_mut.push(Attribute { name: QualName::new(None, ns!(), local_name!("src")), - value: Tendril::from_slice(dataurl.as_ref()), + value: Tendril::from_slice(data_url.as_ref()), }); } } @@ -270,7 +270,7 @@ pub fn walk_and_embed_assets( } else { let srcset_full_url = resolve_url(&url, attr.value.trim()).unwrap_or_default(); - let (source_dataurl, _) = retrieve_asset( + let (source_data_url, _) = retrieve_asset( cache, client, &srcset_full_url, @@ -280,7 +280,7 @@ pub fn walk_and_embed_assets( ) .unwrap_or((str!(), str!())); attr.value.clear(); - attr.value.push_slice(source_dataurl.as_str()); + attr.value.push_slice(source_data_url.as_str()); } } } @@ -334,7 +334,7 @@ pub fn walk_and_embed_assets( if &attr.name.local == "src" { let src_full_url = resolve_url(&url, attr.value.trim()).unwrap_or_default(); - let (js_dataurl, _) = retrieve_asset( + let (js_data_url, _) = retrieve_asset( cache, client, &src_full_url, @@ -344,7 +344,7 @@ pub fn walk_and_embed_assets( ) .unwrap_or((str!(), str!())); attr.value.clear(); - attr.value.push_slice(js_dataurl.as_str()); + attr.value.push_slice(js_data_url.as_str()); } } } @@ -377,7 +377,7 @@ pub fn walk_and_embed_assets( if &attr.name.local == "action" { let attr_value = attr.value.trim(); // Modify action to be a full URL - if !is_valid_url(attr_value) { + if !is_http_url(attr_value) { let href_full_url = resolve_url(&url, attr_value).unwrap_or_default(); attr.value.clear(); @@ -426,9 +426,9 @@ pub fn walk_and_embed_assets( ); let mut buf: Vec = Vec::new(); serialize(&mut buf, &dom.document, SerializeOpts::default()).unwrap(); - let iframe_dataurl = data_to_dataurl("text/html", &buf); + let iframe_data_url = data_to_data_url("text/html", &buf); attr.value.clear(); - attr.value.push_slice(iframe_dataurl.as_str()); + attr.value.push_slice(iframe_data_url.as_str()); } } } @@ -447,7 +447,7 @@ pub fn walk_and_embed_assets( } else { let poster_full_url = resolve_url(&url, video_poster).unwrap_or_default(); - let (poster_dataurl, _) = retrieve_asset( + let (poster_data_url, _) = retrieve_asset( cache, client, &poster_full_url, @@ -457,7 +457,7 @@ pub fn walk_and_embed_assets( ) .unwrap_or((poster_full_url, str!())); attr.value.clear(); - attr.value.push_slice(poster_dataurl.as_str()); + attr.value.push_slice(poster_data_url.as_str()); } } } diff --git a/src/http.rs b/src/http.rs index ef1b843..d458c34 100644 --- a/src/http.rs +++ b/src/http.rs @@ -1,4 +1,4 @@ -use crate::utils::{clean_url, data_to_dataurl, is_data_url}; +use crate::utils::{clean_url, data_to_data_url, is_data_url}; use reqwest::blocking::Client; use reqwest::header::CONTENT_TYPE; use std::collections::HashMap; @@ -7,13 +7,13 @@ pub fn retrieve_asset( cache: &mut HashMap, client: &Client, url: &str, - as_dataurl: bool, + as_data_url: bool, mime: &str, opt_silent: bool, ) -> Result<(String, String), reqwest::Error> { let cache_key = clean_url(&url); - if is_data_url(&url).unwrap() { + if is_data_url(&url) { Ok((url.to_string(), url.to_string())) } else { if cache.contains_key(&cache_key) { @@ -38,7 +38,7 @@ pub fn retrieve_asset( let new_cache_key = clean_url(&res_url); - if as_dataurl { + if as_data_url { // Convert response into a byte array let mut data: Vec = vec![]; response.copy_to(&mut data)?; @@ -53,10 +53,10 @@ pub fn retrieve_asset( } else { mime }; - let dataurl = data_to_dataurl(&mimetype, &data); + let data_url = data_to_data_url(&mimetype, &data); // insert in cache - cache.insert(new_cache_key, dataurl.clone()); - Ok((dataurl, res_url)) + cache.insert(new_cache_key, data_url.clone()); + Ok((data_url, res_url)) } else { let content = response.text().unwrap(); // insert in cache diff --git a/src/main.rs b/src/main.rs index 183545f..83fc37e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -7,7 +7,7 @@ mod macros; use crate::args::AppArgs; use monolith::html::{html_to_dom, stringify_document, walk_and_embed_assets}; use monolith::http::retrieve_asset; -use monolith::utils::is_valid_url; +use monolith::utils::{data_url_to_text, is_data_url, is_http_url}; use reqwest::blocking::Client; use reqwest::header::{HeaderMap, HeaderValue, USER_AGENT}; use std::collections::HashMap; @@ -46,11 +46,14 @@ impl Output { fn main() { let app_args = AppArgs::get(); + let target_url: &str = app_args.url_target.as_str(); + let base_url; + let dom; - if !is_valid_url(app_args.url_target.as_str()) { + if !is_http_url(target_url) && !is_data_url(target_url) { eprintln!( - "Only HTTP and HTTPS URLs are allowed but got: {}", - &app_args.url_target + "Only HTTP(S) or data URLs are supported but got: {}", + &target_url ); process::exit(1); } @@ -78,21 +81,30 @@ fn main() { .expect("Failed to initialize HTTP client"); // Retrieve root document - let (data, final_url) = retrieve_asset( - &mut cache, - &client, - app_args.url_target.as_str(), - false, - "", - app_args.silent, - ) - .expect("Could not retrieve assets in HTML"); - let dom = html_to_dom(&data); + if is_http_url(target_url) { + let (data, final_url) = + retrieve_asset(&mut cache, &client, target_url, false, "", app_args.silent) + .expect("Could not retrieve assets in HTML"); + base_url = final_url; + dom = html_to_dom(&data); + } else if is_data_url(target_url) { + let text: String = data_url_to_text(target_url); + + if text.len() == 0 { + eprintln!("Unsupported data URL input"); + process::exit(1); + } + + base_url = str!(); + dom = html_to_dom(&text); + } else { + process::exit(1); + } walk_and_embed_assets( &mut cache, &client, - &final_url, + &base_url, &dom.document, app_args.no_css, app_args.no_js, diff --git a/src/tests/cli.rs b/src/tests/cli.rs new file mode 100644 index 0000000..9a6aa82 --- /dev/null +++ b/src/tests/cli.rs @@ -0,0 +1,182 @@ +use assert_cmd::prelude::*; +use std::process::Command; + +#[test] +fn print_version() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd.arg("-V").output().unwrap(); + + // STDOUT should contain program name and version + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + format!("{} {}\n", env!("CARGO_PKG_NAME"), env!("CARGO_PKG_VERSION")) + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) +} + +#[test] +fn bad_input() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd.arg("kernel.org").output().unwrap(); + + // STDOUT should be empty + assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), ""); + + // STDERR should contain error description + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + "Only HTTP(S) or data URLs are supported but got: kernel.org\n" + ); + + // The exit code should be 1 + out.assert().code(1); + + Ok(()) +} + +#[test] +fn bad_input_data_url() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd.arg("data:,Hello%2C%20World!").output().unwrap(); + + // STDOUT should contain HTML + assert_eq!(std::str::from_utf8(&out.stdout).unwrap(), ""); + + // STDERR should contain error description + assert_eq!( + std::str::from_utf8(&out.stderr).unwrap(), + "Unsupported data URL input\n" + ); + + // The exit code should be 1 + out.assert().code(1); + + Ok(()) +} + +#[test] +fn isolate_data_url() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-I") + .arg("data:text/html,Hello%2C%20World!") + .output() + .unwrap(); + + // STDOUT should contain isolated HTML + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "Hello, World!\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) +} + +#[test] +fn remove_css_from_data_url() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-c") + .arg("data:text/html,Hello") + .output() + .unwrap(); + + // STDOUT should contain HTML with no CSS + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "Hello\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) +} + +#[test] +fn remove_frames_from_data_url() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-f") + .arg("data:text/html,Hi") + .output() + .unwrap(); + + // STDOUT should contain HTML with no iframes + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "Hi\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) +} + +#[test] +fn remove_images_from_data_url() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-i") + .arg("data:text/html,Hi") + .output() + .unwrap(); + + // STDOUT should contain HTML with no images + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "Hi\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) +} + +#[test] +fn remove_js_from_data_url() -> Result<(), Box> { + let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME"))?; + let out = cmd + .arg("-j") + .arg("data:text/html,Hi") + .output() + .unwrap(); + + // STDOUT should contain HTML with no JS + assert_eq!( + std::str::from_utf8(&out.stdout).unwrap(), + "Hi\n" + ); + + // STDERR should be empty + assert_eq!(std::str::from_utf8(&out.stderr).unwrap(), ""); + + // The exit code should be 0 + out.assert().code(0); + + Ok(()) +} diff --git a/src/tests/html.rs b/src/tests/html.rs index bb554c9..6ea6d76 100644 --- a/src/tests/html.rs +++ b/src/tests/html.rs @@ -503,7 +503,7 @@ fn test_stringify_document_isolate_no_frames_no_js_no_css_no_images() { "\ \ \ - \ + \ no-frame no-css no-js no-image isolated document\ \ \ diff --git a/src/tests/js.rs b/src/tests/js.rs index 30ebda8..c25d05f 100644 --- a/src/tests/js.rs +++ b/src/tests/js.rs @@ -2,7 +2,7 @@ use crate::js::attr_is_event_handler; #[test] fn test_attr_is_event_handler() { - // succeeding + // passing assert!(attr_is_event_handler("onBlur")); assert!(attr_is_event_handler("onclick")); assert!(attr_is_event_handler("onClick")); diff --git a/src/tests/mod.rs b/src/tests/mod.rs index a77b631..0051cfc 100644 --- a/src/tests/mod.rs +++ b/src/tests/mod.rs @@ -1,3 +1,4 @@ +mod cli; mod html; mod http; mod js; diff --git a/src/tests/utils.rs b/src/tests/utils.rs index a49f7cb..f1169c6 100644 --- a/src/tests/utils.rs +++ b/src/tests/utils.rs @@ -1,14 +1,14 @@ use crate::utils::{ - clean_url, data_to_dataurl, detect_mimetype, is_data_url, is_valid_url, resolve_url, - url_has_protocol, + clean_url, data_to_data_url, data_url_to_text, detect_mimetype, is_data_url, is_http_url, + resolve_url, url_has_protocol, }; use url::ParseError; #[test] -fn test_data_to_dataurl() { +fn test_data_to_data_url() { let mime = "application/javascript"; let data = "var word = 'hello';\nalert(word);\n"; - let datauri = data_to_dataurl(mime, data.as_bytes()); + let datauri = data_to_data_url(mime, data.as_bytes()); assert_eq!( &datauri, "data:application/javascript;base64,dmFyIHdvcmQgPSAnaGVsbG8nOwphbGVydCh3b3JkKTsK" @@ -43,7 +43,7 @@ fn test_detect_mimetype() { #[test] fn test_url_has_protocol() { - // succeeding + // passing assert_eq!( url_has_protocol("mailto:somebody@somewhere.com?subject=hello"), true @@ -71,16 +71,16 @@ fn test_url_has_protocol() { } #[test] -fn test_is_valid_url() { - // succeeding - assert!(is_valid_url("https://www.rust-lang.org/")); - assert!(is_valid_url("http://kernel.org")); +fn test_is_http_url() { + // passing + assert!(is_http_url("https://www.rust-lang.org/")); + assert!(is_http_url("http://kernel.org")); // failing - assert!(!is_valid_url("//kernel.org")); - assert!(!is_valid_url("./index.html")); - assert!(!is_valid_url("some-local-page.htm")); - assert!(!is_valid_url("ftp://1.2.3.4/www/index.html")); - assert!(!is_valid_url( + assert!(!is_http_url("//kernel.org")); + assert!(!is_http_url("./index.html")); + assert!(!is_http_url("some-local-page.htm")); + assert!(!is_http_url("ftp://1.2.3.4/www/index.html")); + assert!(!is_http_url( "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h" )); } @@ -144,20 +144,35 @@ fn test_resolve_url() -> Result<(), ParseError> { "https://www.w3schools.com/html/default.asp" ); + let resolved_url = resolve_url( + "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h", + "https://www.kernel.org/category/signatures.html", + )?; + assert_eq!( + resolved_url.as_str(), + "https://www.kernel.org/category/signatures.html" + ); + + let resolved_url = resolve_url( + "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h", + "//www.w3schools.com/html/html_iframe.asp", + ) + .unwrap_or(str!()); + assert_eq!(resolved_url.as_str(), ""); + Ok(()) } #[test] fn test_is_data_url() { - // succeeding - assert!( - is_data_url("data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h") - .unwrap_or(false) - ); + // passing + assert!(is_data_url( + "data:text/html;base64,V2VsY29tZSBUbyBUaGUgUGFydHksIDxiPlBhbDwvYj4h" + )); // failing - assert!(!is_data_url("https://kernel.org").unwrap_or(false)); - assert!(!is_data_url("//kernel.org").unwrap_or(false)); - assert!(!is_data_url("").unwrap_or(false)); + assert!(!is_data_url("https://kernel.org")); + assert!(!is_data_url("//kernel.org")); + assert!(!is_data_url("")); } #[test] @@ -175,3 +190,25 @@ fn test_clean_url() { "https://somewhere.com/font.eot" ); } + +#[test] +fn test_data_url_to_text() { + assert_eq!( + data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="), + "Work expands so as to fill the time available for its completion" + ); + + assert_eq!( + data_url_to_text( + "data:text/html;utf8,Work expands so as to fill the time available for its completion" + ), + "Work expands so as to fill the time available for its completion" + ); + + assert_eq!( + data_url_to_text( + "data:text/html,Work expands so as to fill the time available for its completion" + ), + "Work expands so as to fill the time available for its completion" + ); +} diff --git a/src/utils.rs b/src/utils.rs index 8b0cc80..c04f9e9 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,9 +1,9 @@ use crate::http::retrieve_asset; -use base64::encode; +use base64::{decode, encode}; use regex::Regex; use reqwest::blocking::Client; use std::collections::HashMap; -use url::{ParseError, Url}; +use url::{form_urlencoded, ParseError, Url}; /// This monster of a regex is used to match any kind of URL found in CSS. /// @@ -37,8 +37,6 @@ use url::{ParseError, Url}; const CSS_URL_REGEX_STR: &str = r###"(?:(?:(?P@import)|(?Psrc\s*:))\s+)?url\((?P['"]?(?P[^"'\)]+)['"]?)\)"###; lazy_static! { - static ref HAS_PROTOCOL: Regex = Regex::new(r"^[a-z0-9]+:").unwrap(); - static ref REGEX_URL: Regex = Regex::new(r"^https?://").unwrap(); static ref REGEX_CSS_URL: Regex = Regex::new(CSS_URL_REGEX_STR).unwrap(); } @@ -67,7 +65,7 @@ const MAGIC: [[&[u8]; 2]; 19] = [ [b"\x1A\x45\xDF\xA3", b"video/webm"], ]; -pub fn data_to_dataurl(mime: &str, data: &[u8]) -> String { +pub fn data_to_data_url(mime: &str, data: &[u8]) -> String { let mimetype = if mime.is_empty() { detect_mimetype(data) } else { @@ -82,23 +80,29 @@ pub fn detect_mimetype(data: &[u8]) -> String { return String::from_utf8(item[1].to_vec()).unwrap(); } } - "".to_owned() + str!() } pub fn url_has_protocol>(url: T) -> bool { - HAS_PROTOCOL.is_match(url.as_ref().to_lowercase().as_str()) + Url::parse(url.as_ref()) + .and_then(|u| Ok(u.scheme().len() > 0)) + .unwrap_or(false) } -pub fn is_data_url>(url: T) -> Result { - Url::parse(url.as_ref()).and_then(|u| Ok(u.scheme() == "data")) +pub fn is_data_url>(url: T) -> bool { + Url::parse(url.as_ref()) + .and_then(|u| Ok(u.scheme() == "data")) + .unwrap_or(false) } -pub fn is_valid_url>(path: T) -> bool { - REGEX_URL.is_match(path.as_ref()) +pub fn is_http_url>(url: T) -> bool { + Url::parse(url.as_ref()) + .and_then(|u| Ok(u.scheme() == "http" || u.scheme() == "https")) + .unwrap_or(false) } pub fn resolve_url, U: AsRef>(from: T, to: U) -> Result { - let result = if is_valid_url(to.as_ref()) { + let result = if is_http_url(to.as_ref()) { to.as_ref().to_string() } else { Url::parse(from.as_ref())? @@ -113,7 +117,7 @@ pub fn resolve_css_imports( cache: &mut HashMap, client: &Client, css_string: &str, - as_dataurl: bool, + as_data_url: bool, href: &str, opt_no_images: bool, opt_silent: bool, @@ -150,7 +154,7 @@ pub fn resolve_css_imports( cache, client, &content, - true, // Finally, convert to a dataurl + true, // Finally, convert to a data URL &embedded_url, opt_no_images, opt_silent, @@ -188,8 +192,8 @@ pub fn resolve_css_imports( resolved_css.replace_range(target_range, &replacement); } - if as_dataurl { - data_to_dataurl("text/css", resolved_css.as_bytes()) + if as_data_url { + data_to_data_url("text/css", resolved_css.as_bytes()) } else { resolved_css } @@ -205,3 +209,61 @@ pub fn clean_url>(url: T) -> String { } result.to_string() } + +pub fn data_url_to_text>(url: T) -> String { + let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("http://[::1]").unwrap()); + let path: String = parsed_url.path().to_string(); + let comma_loc: usize = path.find(',').unwrap_or(path.len()); + + if comma_loc == path.len() { + return str!(); + } + + let meta_data: String = path.chars().take(comma_loc).collect(); + let raw_data: String = path.chars().skip(comma_loc + 1).collect(); + + let data: String = form_urlencoded::parse(raw_data.as_bytes()) + .map(|(key, val)| { + [ + key.to_string(), + if val.to_string().len() == 0 { + str!() + } else { + str!('=') + }, + val.to_string(), + ] + .concat() + }) + .collect(); + + let meta_data_items: Vec<&str> = meta_data.split(';').collect(); + let mut mime_type: &str = ""; + let mut encoding: &str = ""; + + let mut i: i8 = 0; + for item in &meta_data_items { + if i == 0 { + if item.eq_ignore_ascii_case("text/html") { + mime_type = item; + continue; + } + } + + if item.eq_ignore_ascii_case("base64") || item.eq_ignore_ascii_case("utf8") { + encoding = item; + } + + i = i + 1; + } + + if mime_type.eq_ignore_ascii_case("text/html") { + if encoding.eq_ignore_ascii_case("base64") { + String::from_utf8(decode(&data).unwrap_or(vec![])).unwrap_or(str!()) + } else { + data + } + } else { + str!() + } +}