From 594ad55bd808cc2cc76cac5258cfe063ca4615ef Mon Sep 17 00:00:00 2001 From: Sunshine Date: Fri, 10 Apr 2020 05:06:07 -0400 Subject: [PATCH] improve data URL media type detection --- src/main.rs | 6 +-- src/tests/cli.rs | 2 +- src/tests/utils/data_url_to_text.rs | 59 +++++++++++++++++++++-------- src/utils.rs | 41 ++++++++++---------- 4 files changed, 68 insertions(+), 40 deletions(-) diff --git a/src/main.rs b/src/main.rs index b06b9c8..8243431 100644 --- a/src/main.rs +++ b/src/main.rs @@ -122,9 +122,9 @@ fn main() { base_url = final_url; dom = html_to_dom(&data); } else if is_data_url(target_url) { - let text: String = data_url_to_text(target_url); - if text.len() == 0 { - eprintln!("Unsupported data URL input"); + let (media_type, text): (String, String) = data_url_to_text(target_url); + if !media_type.eq_ignore_ascii_case("text/html") { + eprintln!("Unsupported data URL media type"); process::exit(1); } base_url = str!(target_url); diff --git a/src/tests/cli.rs b/src/tests/cli.rs index 3158483..de2fdf9 100644 --- a/src/tests/cli.rs +++ b/src/tests/cli.rs @@ -62,7 +62,7 @@ fn passing_bad_input_data_url() -> Result<(), Box> { // STDERR should contain error description assert_eq!( std::str::from_utf8(&out.stderr).unwrap(), - "Unsupported data URL input\n" + "Unsupported data URL media type\n" ); // The exit code should be 1 diff --git a/src/tests/utils/data_url_to_text.rs b/src/tests/utils/data_url_to_text.rs index 32e3da2..815a7d6 100644 --- a/src/tests/utils/data_url_to_text.rs +++ b/src/tests/utils/data_url_to_text.rs @@ -9,48 +9,74 @@ use crate::utils; #[test] fn passing_parse_text_html_base64() { + let (media_type, text) = utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="); + + assert_eq!(media_type, "text/html"); assert_eq!( - utils::data_url_to_text("data:text/html;base64,V29yayBleHBhbmRzIHNvIGFzIHRvIGZpbGwgdGhlIHRpbWUgYXZhaWxhYmxlIGZvciBpdHMgY29tcGxldGlvbg=="), + text, "Work expands so as to fill the time available for its completion" ); } #[test] fn passing_parse_text_html_utf8() { + let (media_type, text) = utils::data_url_to_text( + "data:text/html;utf8,Work expands so as to fill the time available for its completion", + ); + + assert_eq!(media_type, "text/html"); assert_eq!( - utils::data_url_to_text( - "data:text/html;utf8,Work expands so as to fill the time available for its completion" - ), + text, "Work expands so as to fill the time available for its completion" ); } #[test] fn passing_parse_text_html_plaintext() { + let (media_type, text) = utils::data_url_to_text( + "data:text/html,Work expands so as to fill the time available for its completion", + ); + + assert_eq!(media_type, "text/html"); assert_eq!( - utils::data_url_to_text( - "data:text/html,Work expands so as to fill the time available for its completion" - ), + text, "Work expands so as to fill the time available for its completion" ); } #[test] fn passing_parse_text_html_charset_utf_8_between_two_whitespaces() { + let (media_type, text) = utils::data_url_to_text(" data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion "); + + assert_eq!(media_type, "text/html"); assert_eq!( - utils::data_url_to_text( - " data:text/html;charset=utf-8,Work expands so as to fill the time available for its completion " - ), + text, "Work expands so as to fill the time available for its completion" ); } #[test] fn passing_parse_text_css_url_encoded() { - assert_eq!( - utils::data_url_to_text("data:text/css,div{background-color:%23000}"), - "div{background-color:#000}" - ); + let (media_type, text) = utils::data_url_to_text("data:text/css,div{background-color:%23000}"); + + assert_eq!(media_type, "text/css"); + assert_eq!(text, "div{background-color:#000}"); +} + +#[test] +fn passing_parse_no_media_type_base64() { + let (media_type, text) = utils::data_url_to_text("data:;base64,dGVzdA=="); + + assert_eq!(media_type, ""); + assert_eq!(text, "test"); +} + +#[test] +fn passing_parse_no_media_type_no_encoding() { + let (media_type, text) = utils::data_url_to_text("data:;,test%20test"); + + assert_eq!(media_type, ""); + assert_eq!(text, "test test"); } // ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ @@ -62,5 +88,8 @@ fn passing_parse_text_css_url_encoded() { #[test] fn failing_just_word_data() { - assert_eq!(utils::data_url_to_text("data"), ""); + let (media_type, text) = utils::data_url_to_text("data"); + + assert_eq!(media_type, ""); + assert_eq!(text, ""); } diff --git a/src/utils.rs b/src/utils.rs index 885f1c3..6221837 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -133,50 +133,47 @@ pub fn clean_url>(url: T) -> String { result.to_string() } -pub fn data_url_to_text>(url: T) -> String { - let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("http://[::1]").unwrap()); +pub fn data_url_to_text>(url: T) -> (String, String) { + let parsed_url = Url::parse(url.as_ref()).unwrap_or(Url::parse("data:,").unwrap()); let path: String = parsed_url.path().to_string(); let comma_loc: usize = path.find(',').unwrap_or(path.len()); - if comma_loc == path.len() { - return str!(); - } - let meta_data: String = path.chars().take(comma_loc).collect(); let raw_data: String = path.chars().skip(comma_loc + 1).collect(); let data: String = decode_url(raw_data); let meta_data_items: Vec<&str> = meta_data.split(';').collect(); - let mut media_type: &str = ""; let mut encoding: &str = ""; - // Detect media type and encoding + let mut media_type: String = str!(); + let mut text: String = str!(); + let mut i: i8 = 0; for item in &meta_data_items { if i == 0 { - if is_plaintext_media_type(item) { - media_type = item; - continue; + media_type = str!(item); + } else { + if item.eq_ignore_ascii_case("base64") + || item.eq_ignore_ascii_case("utf8") + || item.eq_ignore_ascii_case("charset=UTF-8") + { + encoding = item; } } - if item.eq_ignore_ascii_case("base64") || item.eq_ignore_ascii_case("utf8") { - encoding = item; - } - i = i + 1; } - if is_plaintext_media_type(media_type) { + if is_plaintext_media_type(&media_type) || media_type.is_empty() { if encoding.eq_ignore_ascii_case("base64") { - String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!()) + text = String::from_utf8(base64::decode(&data).unwrap_or(vec![])).unwrap_or(str!()) } else { - data + text = data } - } else { - str!() } + + (media_type, text) } pub fn decode_url(input: String) -> String { @@ -238,7 +235,9 @@ pub fn retrieve_asset( if as_data_url { Ok((url.to_string(), url.to_string())) } else { - Ok((data_url_to_text(url), url.to_string())) + let (_media_type, text) = data_url_to_text(url); + + Ok((text, url.to_string())) } } else if is_file_url(&url) { // Check if parent_url is also file:///