diff --git a/README.md b/README.md index 363c7b7..ab063c3 100644 --- a/README.md +++ b/README.md @@ -137,9 +137,10 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html - `-b`: Use custom `base URL` - `-B`: Forbid retrieving assets from specified domain(s) - `-c`: Exclude CSS - - `-C`: Save document using custom `charset` + - `-C`: Read cookies from `file` - `-d`: Allow retrieving assets only from specified `domain(s)` - `-e`: Ignore network errors + - `-E`: Save document using custom `encoding` - `-f`: Omit frames - `-F`: Exclude web fonts - `-i`: Remove images diff --git a/src/cookies.rs b/src/cookies.rs new file mode 100644 index 0000000..85eeebb --- /dev/null +++ b/src/cookies.rs @@ -0,0 +1,119 @@ +use std::time::{SystemTime, UNIX_EPOCH}; +use url::Url; + +pub struct Cookie { + pub domain: String, + pub include_subdomains: bool, + pub path: String, + pub https_only: bool, + pub expires: u64, + pub name: String, + pub value: String, +} + +#[derive(Debug)] +pub enum CookieFileContentsParseError { + InvalidHeader, +} + +impl Cookie { + pub fn is_expired(&self) -> bool { + if self.expires == 0 { + return false; // Session, never expires + } + + let start = SystemTime::now(); + let since_the_epoch = start + .duration_since(UNIX_EPOCH) + .expect("Time went backwards"); + + self.expires < since_the_epoch.as_secs() + } + + pub fn matches_url(&self, url: &str) -> bool { + match Url::parse(&url) { + Ok(url) => { + // Check protocol scheme + match url.scheme() { + "http" => { + if self.https_only { + return false; + } + } + "https" => {} + _ => { + // Should never match URLs of protocols other than HTTP(S) + return false; + } + } + + // Check host + if let Some(url_host) = url.host_str() { + if self.domain.starts_with(".") && self.include_subdomains { + if !url_host.to_lowercase().ends_with(&self.domain) + && !url_host + .eq_ignore_ascii_case(&self.domain[1..self.domain.len() - 1]) + { + return false; + } + } else { + if 
!url_host.eq_ignore_ascii_case(&self.domain) { + return false; + } + } + } else { + return false; + } + + // Check path + if !url.path().eq_ignore_ascii_case(&self.path) + && !url.path().starts_with(&self.path) + { + return false; + } + } + Err(_) => { + return false; + } + } + + true + } +} + +pub fn parse_cookie_file_contents( + cookie_file_contents: &str, +) -> Result, CookieFileContentsParseError> { + let mut cookies: Vec = Vec::new(); + + for (i, line) in cookie_file_contents.lines().enumerate() { + if i == 0 { + // Parsing first line + if !line.eq("# HTTP Cookie File") && !line.eq("# Netscape HTTP Cookie File") { + return Err(CookieFileContentsParseError::InvalidHeader); + } + } else { + // Ignore comment lines + if line.starts_with("#") { + continue; + } + + // Attempt to parse values + let mut fields = line.split("\t"); + if fields.clone().count() != 7 { + continue; + } + cookies.push(Cookie { + domain: fields.next().unwrap().to_string().to_lowercase(), + include_subdomains: fields.next().unwrap().to_string() == "TRUE", + path: fields.next().unwrap().to_string(), + https_only: fields.next().unwrap().to_string() == "TRUE", + expires: fields.next().unwrap().parse::().unwrap(), + name: fields.next().unwrap().to_string(), + value: fields.next().unwrap().to_string(), + }); + } + } + + Ok(cookies) +} diff --git a/src/lib.rs b/src/lib.rs index 57cd530..460d388 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +pub mod cookies; pub mod css; pub mod html; pub mod js; diff --git a/src/main.rs b/src/main.rs index 0517d57..fdd2359 100644 --- a/src/main.rs +++ b/src/main.rs @@ -10,6 +10,7 @@ use std::process; use std::time::Duration; use url::Url; +use monolith::cookies::parse_cookie_file_contents; use monolith::html::{ add_favicon, create_metadata_tag, get_base_url, get_charset, has_favicon, html_to_dom, serialize_document, set_base_url, set_charset, walk_and_embed_assets, @@ -64,7 +65,7 @@ pub fn read_stdin() -> Vec { } fn main() { - let options = 
Options::from_args(); + let mut options = Options::from_args(); // Check if target was provided if options.target.len() == 0 { @@ -74,10 +75,10 @@ fn main() { process::exit(1); } - // Check if custom charset is valid - if let Some(custom_charset) = options.charset.clone() { - if !Encoding::for_label_no_replacement(custom_charset.as_bytes()).is_some() { - eprintln!("Unknown encoding: {}", &custom_charset); + // Check if custom encoding is valid + if let Some(custom_encoding) = options.encoding.clone() { + if !Encoding::for_label_no_replacement(custom_encoding.as_bytes()).is_some() { + eprintln!("Unknown encoding: {}", &custom_encoding); process::exit(1); } } @@ -139,6 +140,30 @@ fn main() { }, }; + // Read and parse cookie file + if let Some(opt_cookie_file) = options.cookie_file.clone() { + match fs::read_to_string(opt_cookie_file) { + Ok(str) => match parse_cookie_file_contents(&str) { + Ok(cookies) => { + options.cookies = cookies; + // for c in &cookies { + // // if !cookie.is_expired() { + // // options.cookies.append(c); + // // } + // } + } + Err(_) => { + eprintln!("Could not parse specified cookie file"); + process::exit(1); + } + }, + Err(_) => { + eprintln!("Could not read specified cookie file"); + process::exit(1); + } + } + } + // Initialize client let mut cache = HashMap::new(); let mut header_map = HeaderMap::new(); @@ -315,8 +340,8 @@ fn main() { } // Save using specified charset, if given - if let Some(custom_charset) = options.charset.clone() { - document_encoding = custom_charset; + if let Some(custom_encoding) = options.encoding.clone() { + document_encoding = custom_encoding; dom = set_charset(dom, document_encoding.clone()); } diff --git a/src/opts.rs b/src/opts.rs index bee5d86..7a90694 100644 --- a/src/opts.rs +++ b/src/opts.rs @@ -1,15 +1,19 @@ use clap::{App, Arg, ArgAction}; use std::env; +use crate::cookies::Cookie; + #[derive(Default)] pub struct Options { pub no_audio: bool, pub base_url: Option, pub blacklist_domains: bool, pub 
no_css: bool, - pub charset: Option, + pub cookie_file: Option, + pub cookies: Vec, pub domains: Option>, pub ignore_errors: bool, + pub encoding: Option, pub no_frames: bool, pub no_fonts: bool, pub no_images: bool, @@ -48,13 +52,13 @@ impl Options { .version(env!("CARGO_PKG_VERSION")) .author(format!("\n{}\n\n", env!("CARGO_PKG_AUTHORS").replace(':', "\n")).as_str()) .about(format!("{}\n{}", ASCII, env!("CARGO_PKG_DESCRIPTION")).as_str()) - .args_from_usage("-a, --no-audio 'Removes audio sources'") - .args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'") + .args_from_usage("-a, --no-audio 'Remove audio sources'") + .args_from_usage("-b, --base-url=[http://localhost/] 'Set custom base URL'") .args_from_usage( "-B, --blacklist-domains 'Treat list of specified domains as blacklist'", ) - .args_from_usage("-c, --no-css 'Removes CSS'") - .args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'") + .args_from_usage("-c, --no-css 'Remove CSS'") + .args_from_usage("-C, --cookies=[cookies.txt] 'Specify cookie file'") .arg( Arg::with_name("domains") .short('d') @@ -65,23 +69,24 @@ impl Options { .help("Specify domains to use for white/black-listing"), ) .args_from_usage("-e, --ignore-errors 'Ignore network errors'") - .args_from_usage("-f, --no-frames 'Removes frames and iframes'") - .args_from_usage("-F, --no-fonts 'Removes fonts'") - .args_from_usage("-i, --no-images 'Removes images'") - .args_from_usage("-I, --isolate 'Cuts off document from the Internet'") - .args_from_usage("-j, --no-js 'Removes JavaScript'") - .args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'") - .args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'") + .args_from_usage("-E, --encoding=[UTF-8] 'Enforce custom charset'") + .args_from_usage("-f, --no-frames 'Remove frames and iframes'") + .args_from_usage("-F, --no-fonts 'Remove fonts'") + .args_from_usage("-i, --no-images 'Remove images'") + .args_from_usage("-I, 
--isolate 'Cut off document from the Internet'") + .args_from_usage("-j, --no-js 'Remove JavaScript'") + .args_from_usage("-k, --insecure 'Allow invalid X.509 (TLS) certificates'") + .args_from_usage("-M, --no-metadata 'Exclude timestamp and source information'") .args_from_usage( - "-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'", + "-n, --unwrap-noscript 'Replace NOSCRIPT elements with their contents'", ) .args_from_usage( - "-o, --output=[document.html] 'Writes output to , use - for STDOUT'", + "-o, --output=[document.html] 'Write output to , use - for STDOUT'", ) - .args_from_usage("-s, --silent 'Suppresses verbosity'") - .args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'") - .args_from_usage("-u, --user-agent=[Firefox] 'Sets custom User-Agent string'") - .args_from_usage("-v, --no-video 'Removes video sources'") + .args_from_usage("-s, --silent 'Suppress verbosity'") + .args_from_usage("-t, --timeout=[60] 'Adjust network request timeout'") + .args_from_usage("-u, --user-agent=[Firefox] 'Set custom User-Agent string'") + .args_from_usage("-v, --no-video 'Remove video sources'") .arg( Arg::with_name("target") .required(true) @@ -103,8 +108,11 @@ impl Options { } options.blacklist_domains = app.is_present("blacklist-domains"); options.no_css = app.is_present("no-css"); - if let Some(charset) = app.value_of("charset") { - options.charset = Some(charset.to_string()); + if let Some(cookie_file) = app.value_of("cookies") { + options.cookie_file = Some(cookie_file.to_string()); + } + if let Some(encoding) = app.value_of("encoding") { + options.encoding = Some(encoding.to_string()); } if let Some(domains) = app.get_many::("domains") { let list_of_domains: Vec = domains.map(|v| v.clone()).collect::>(); diff --git a/src/utils.rs b/src/utils.rs index 621a083..d6ae520 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,5 +1,5 @@ use reqwest::blocking::Client; -use reqwest::header::CONTENT_TYPE; +use reqwest::header::{HeaderMap, 
HeaderValue, CONTENT_TYPE, COOKIE}; use std::collections::HashMap; use std::fs; use std::path::{Path, PathBuf}; @@ -304,7 +304,17 @@ pub fn retrieve_asset( } // URL not in cache, we retrieve the file - match client.get(url.as_str()).send() { + let mut headers = HeaderMap::new(); + if options.cookies.len() > 0 { + for cookie in &options.cookies { + if !cookie.is_expired() && cookie.matches_url(url.as_str()) { + let cookie_header_value: String = cookie.name.clone() + "=" + &cookie.value; + headers + .insert(COOKIE, HeaderValue::from_str(&cookie_header_value).unwrap()); + } + } + } + match client.get(url.as_str()).headers(headers).send() { Ok(response) => { if !options.ignore_errors && response.status() != reqwest::StatusCode::OK { if !options.silent { diff --git a/tests/cli/unusual_encodings.rs b/tests/cli/unusual_encodings.rs index 2a68c37..922922a 100644 --- a/tests/cli/unusual_encodings.rs +++ b/tests/cli/unusual_encodings.rs @@ -115,7 +115,7 @@ mod passing { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); let out = cmd .arg("-M") - .arg("-C") + .arg("-E") .arg("utf8") .arg(format!( "tests{s}_data_{s}unusual_encodings{s}gb2312.html", @@ -158,7 +158,7 @@ mod passing { let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap(); let out = cmd .arg("-M") - .arg("-C") + .arg("-E") .arg("utf0") .arg(format!( "tests{s}_data_{s}unusual_encodings{s}gb2312.html", diff --git a/tests/cookies/cookie/is_expired.rs b/tests/cookies/cookie/is_expired.rs new file mode 100644 index 0000000..6bb479c --- /dev/null +++ b/tests/cookies/cookie/is_expired.rs @@ -0,0 +1,68 @@ +// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗ +// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝ +// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗ +// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod passing { + use monolith::cookies; + + 
//  ██████╗  █████╗ ███████╗███████╗██╗███╗   ██╗ ██████╗
//  ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗  ██║██╔════╝
//  ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║  ███╗
//  ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║   ██║
//  ██║     ██║  ██║███████║███████║██║██║ ╚████║╚██████╔╝
//  ╚═╝     ╚═╝  ╚═╝╚══════╝╚══════╝╚═╝╚═╝  ╚═══╝ ╚═════╝

// Cases in which the cookie is expected to match the URL.
#[cfg(test)]
mod passing {
    use monolith::cookies;

    // An HTTPS-only cookie is accepted for an https:// URL.
    #[test]
    fn secure_url() {
        let cookie = cookies::Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains: true,
            path: String::from("/"),
            https_only: true,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };
        assert!(cookie.matches_url("https://127.0.0.1/something"));
    }

    // A cookie without the HTTPS-only flag is accepted for a plain http:// URL.
    #[test]
    fn non_secure_url() {
        let cookie = cookies::Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains: true,
            path: String::from("/"),
            https_only: false,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };
        assert!(cookie.matches_url("http://127.0.0.1/something"));
    }

    // A dot-prefixed domain with include_subdomains covers its subdomains.
    #[test]
    fn subdomain() {
        let cookie = cookies::Cookie {
            domain: String::from(".somethingsomething.com"),
            include_subdomains: true,
            path: String::from("/"),
            https_only: true,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };
        assert!(cookie.matches_url("https://cdn.somethingsomething.com/something"));
    }
}

//  ███████╗ █████╗ ██╗██╗     ██╗███╗   ██╗ ██████╗
//  ██╔════╝██╔══██╗██║██║     ██║████╗  ██║██╔════╝
//  █████╗  ███████║██║██║     ██║██╔██╗ ██║██║  ███╗
//  ██╔══╝  ██╔══██║██║██║     ██║██║╚██╗██║██║   ██║
//  ██║     ██║  ██║██║███████╗██║██║ ╚████║╚██████╔╝
//  ╚═╝     ╚═╝  ╚═╝╚═╝╚══════╝╚═╝╚═╝  ╚═══╝ ╚═════╝

// Cases in which the cookie must NOT match the URL.
#[cfg(test)]
mod failing {
    use monolith::cookies;

    // An empty string is not a parseable URL, hence never matches.
    #[test]
    fn empty_url() {
        let cookie = cookies::Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains: true,
            path: String::from("/"),
            https_only: false,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };
        assert!(!cookie.matches_url(""));
    }

    // Host differs from the cookie's domain.
    #[test]
    fn wrong_hostname() {
        let cookie = cookies::Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains: true,
            path: String::from("/"),
            https_only: false,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };
        assert!(!cookie.matches_url("http://0.0.0.0/"));
    }

    // Host is correct, so only the path mismatch can cause the rejection.
    #[test]
    fn wrong_path() {
        let cookie = cookies::Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains: false,
            path: String::from("/path"),
            https_only: false,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };
        assert!(!cookie.matches_url("http://127.0.0.1/other"));
    }

    // An HTTPS-only cookie must not be matched against a plain http:// URL.
    #[test]
    fn secure_cookie_over_plain_http() {
        let cookie = cookies::Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains: false,
            path: String::from("/"),
            https_only: true,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };
        assert!(!cookie.matches_url("http://127.0.0.1/something"));
    }

    // Schemes other than HTTP(S) never match.
    #[test]
    fn non_http_scheme() {
        let cookie = cookies::Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains: false,
            path: String::from("/"),
            https_only: false,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };
        assert!(!cookie.matches_url("ftp://127.0.0.1/something"));
    }
}
assert_eq!(result[0].include_subdomains, false); + assert_eq!(result[0].path, "/"); + assert_eq!(result[0].https_only, false); + assert_eq!(result[0].expires, 0); + assert_eq!(result[0].name, "USER_TOKEN"); + assert_eq!(result[0].value, "in"); + assert_eq!(result[1].domain, "127.0.0.1"); + assert_eq!(result[1].include_subdomains, true); + assert_eq!(result[1].path, "/"); + assert_eq!(result[1].https_only, true); + assert_eq!(result[1].expires, 9); + assert_eq!(result[1].name, "USER_TOKEN"); + assert_eq!(result[1].value, "out"); + } +} + +// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗ +// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝ +// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗ +// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║ +// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝ +// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝ + +#[cfg(test)] +mod failing { + use monolith::cookies; + + #[test] + fn empty() { + let file_contents = ""; + let result = cookies::parse_cookie_file_contents(&file_contents).unwrap(); + assert_eq!(result.len(), 0); + } + + #[test] + fn no_header() { + let file_contents = "127.0.0.1 FALSE / FALSE 0 USER_TOKEN in"; + match cookies::parse_cookie_file_contents(&file_contents) { + Ok(_result) => { + assert!(false); + } + Err(_e) => { + assert!(true); + } + } + } + + #[test] + fn spaces_instead_of_tabs() { + let file_contents = + "# HTTP Cookie File\n127.0.0.1 FALSE / FALSE 0 USER_TOKEN in"; + let result = cookies::parse_cookie_file_contents(&file_contents).unwrap(); + assert_eq!(result.len(), 0); + } +} diff --git a/tests/mod.rs b/tests/mod.rs index 3ce2821..0f9928c 100644 --- a/tests/mod.rs +++ b/tests/mod.rs @@ -1,4 +1,5 @@ mod cli; +mod cookies; mod css; mod html; mod js; diff --git a/tests/opts.rs b/tests/opts.rs index f307ec3..a015f6e 100644 --- a/tests/opts.rs +++ b/tests/opts.rs @@ -16,7 +16,8 @@ mod passing { assert_eq!(options.no_audio, false); assert_eq!(options.base_url, None); assert_eq!(options.no_css, false); - assert_eq!(options.charset, 
None); + assert_eq!(options.cookie_file, None); + assert_eq!(options.encoding, None); assert_eq!(options.no_frames, false); assert_eq!(options.no_fonts, false); assert_eq!(options.no_images, false);