add support for using cookie file

pull/342/head
Sunshine 1 year ago committed by Sunshine
parent 20c56a5440
commit 78c37958dc

@ -137,9 +137,10 @@ cat index.html | monolith -aIiFfcMv -b https://original.site/ - > result.html
- `-b`: Use custom `base URL`
- `-B`: Forbid retrieving assets from specified domain(s)
- `-c`: Exclude CSS
- `-C`: Save document using custom `charset`
- `-C`: Read cookies from `file`
- `-d`: Allow retrieving assets only from specified `domain(s)`
- `-e`: Ignore network errors
- `-E`: Save document using custom `encoding`
- `-f`: Omit frames
- `-F`: Exclude web fonts
- `-i`: Remove images

@ -0,0 +1,119 @@
use std::time::{SystemTime, UNIX_EPOCH};
use url::Url;
/// A single cookie record, as produced by `parse_cookie_file_contents` from one
/// tab-separated line of a cookie file.
pub struct Cookie {
/// Host the cookie belongs to; the cookie file parser stores it lowercased.
/// A leading `.` combined with `include_subdomains` enables suffix matching
/// in `matches_url`.
pub domain: String,
/// Whether hosts ending in `domain` are accepted (only honored by
/// `matches_url` when `domain` starts with `.`).
pub include_subdomains: bool,
/// Path the request URL's path must equal or start with.
pub path: String,
/// When `true`, `matches_url` rejects plain `http` URLs.
pub https_only: bool,
/// Expiration time in seconds since the Unix epoch; `0` is treated by
/// `is_expired` as "never expires".
pub expires: u64,
/// Cookie name.
pub name: String,
/// Cookie value.
pub value: String,
}
/// Errors reported by `parse_cookie_file_contents`.
#[derive(Debug)]
pub enum CookieFileContentsParseError {
/// The first line of the file is neither `# HTTP Cookie File` nor
/// `# Netscape HTTP Cookie File`.
InvalidHeader,
}
impl Cookie {
    /// Reports whether the cookie's expiration time lies in the past.
    ///
    /// An `expires` value of `0` marks a session cookie, which never expires.
    ///
    /// # Panics
    ///
    /// Panics if the system clock reports a time before the Unix epoch.
    pub fn is_expired(&self) -> bool {
        if self.expires == 0 {
            return false; // Session, never expires
        }

        let since_the_epoch = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .expect("Time went backwards");

        self.expires < since_the_epoch.as_secs()
    }

    /// Decides whether this cookie should be sent with a request to `url`.
    ///
    /// Returns `true` only when all of the following hold:
    /// * `url` parses and its scheme is `https`, or `http` for cookies that
    ///   are not `https_only`;
    /// * the URL's host equals `domain` (ignoring ASCII case), or — when
    ///   `domain` starts with `.` and `include_subdomains` is set — the host
    ///   ends with `domain` or equals `domain` without its leading dot;
    /// * the URL's path equals `path` (ignoring ASCII case) or starts with it.
    ///
    /// Any URL that fails to parse or has no host never matches.
    pub fn matches_url(&self, url: &str) -> bool {
        let url = match Url::parse(url) {
            Ok(url) => url,
            Err(_) => return false,
        };

        // Check protocol scheme
        match url.scheme() {
            "http" => {
                if self.https_only {
                    return false;
                }
            }
            "https" => {}
            // Should never match URLs of protocols other than HTTP(S)
            _ => return false,
        }

        // Check host
        let url_host = match url.host_str() {
            Some(host) => host,
            None => return false,
        };
        if self.domain.starts_with('.') && self.include_subdomains {
            let is_subdomain = url_host.to_lowercase().ends_with(&self.domain);
            // Strip only the leading dot so that ".example.com" also covers
            // the bare host "example.com". The leading '.' is a single byte,
            // so slicing from index 1 is always on a char boundary.
            let is_bare_domain = url_host.eq_ignore_ascii_case(&self.domain[1..]);
            if !is_subdomain && !is_bare_domain {
                return false;
            }
        } else if !url_host.eq_ignore_ascii_case(&self.domain) {
            return false;
        }

        // Check path
        url.path().eq_ignore_ascii_case(&self.path) || url.path().starts_with(&self.path)
    }
}
/// Parses the text of a cookie file into a list of cookies.
///
/// The first line must be exactly `# HTTP Cookie File` or
/// `# Netscape HTTP Cookie File`. Every following line that does not start
/// with `#` is expected to hold seven tab-separated fields: domain,
/// include-subdomains flag (`TRUE`/other), path, HTTPS-only flag
/// (`TRUE`/other), expiration timestamp, name and value.
///
/// Lines that do not have exactly seven fields, or whose expiration field is
/// not a valid unsigned integer, are skipped rather than aborting the parse.
///
/// # Errors
///
/// Returns [`CookieFileContentsParseError::InvalidHeader`] when the first line
/// is not one of the recognized headers. Empty input yields an empty list.
pub fn parse_cookie_file_contents(
    cookie_file_contents: &str,
) -> Result<Vec<Cookie>, CookieFileContentsParseError> {
    let mut cookies: Vec<Cookie> = Vec::new();

    for (i, line) in cookie_file_contents.lines().enumerate() {
        if i == 0 {
            // Parsing first line
            if line != "# HTTP Cookie File" && line != "# Netscape HTTP Cookie File" {
                return Err(CookieFileContentsParseError::InvalidHeader);
            }
            continue;
        }

        // Ignore comment lines
        if line.starts_with('#') {
            continue;
        }

        // Attempt to parse values; anything but exactly seven fields is ignored
        let fields: Vec<&str> = line.split('\t').collect();
        if let &[domain, include_subdomains, path, https_only, expires, name, value] =
            fields.as_slice()
        {
            // The file is user-supplied, so a malformed expiry must not panic;
            // treat it like any other malformed line and skip it.
            if let Ok(expires) = expires.parse::<u64>() {
                cookies.push(Cookie {
                    domain: domain.to_lowercase(),
                    include_subdomains: include_subdomains == "TRUE",
                    path: path.to_string(),
                    https_only: https_only == "TRUE",
                    expires,
                    name: name.to_string(),
                    value: value.to_string(),
                });
            }
        }
    }

    Ok(cookies)
}

@ -1,3 +1,4 @@
pub mod cookies;
pub mod css;
pub mod html;
pub mod js;

@ -10,6 +10,7 @@ use std::process;
use std::time::Duration;
use url::Url;
use monolith::cookies::parse_cookie_file_contents;
use monolith::html::{
add_favicon, create_metadata_tag, get_base_url, get_charset, has_favicon, html_to_dom,
serialize_document, set_base_url, set_charset, walk_and_embed_assets,
@ -64,7 +65,7 @@ pub fn read_stdin() -> Vec<u8> {
}
fn main() {
let options = Options::from_args();
let mut options = Options::from_args();
// Check if target was provided
if options.target.len() == 0 {
@ -74,10 +75,10 @@ fn main() {
process::exit(1);
}
// Check if custom charset is valid
if let Some(custom_charset) = options.charset.clone() {
if !Encoding::for_label_no_replacement(custom_charset.as_bytes()).is_some() {
eprintln!("Unknown encoding: {}", &custom_charset);
// Check if custom encoding is valid
if let Some(custom_encoding) = options.encoding.clone() {
if !Encoding::for_label_no_replacement(custom_encoding.as_bytes()).is_some() {
eprintln!("Unknown encoding: {}", &custom_encoding);
process::exit(1);
}
}
@ -139,6 +140,30 @@ fn main() {
},
};
// Read and parse cookie file
if let Some(opt_cookie_file) = options.cookie_file.clone() {
match fs::read_to_string(opt_cookie_file) {
Ok(str) => match parse_cookie_file_contents(&str) {
Ok(cookies) => {
options.cookies = cookies;
// for c in &cookies {
// // if !cookie.is_expired() {
// // options.cookies.append(c);
// // }
// }
}
Err(_) => {
eprintln!("Could not parse specified cookie file");
process::exit(1);
}
},
Err(_) => {
eprintln!("Could not read specified cookie file");
process::exit(1);
}
}
}
// Initialize client
let mut cache = HashMap::new();
let mut header_map = HeaderMap::new();
@ -315,8 +340,8 @@ fn main() {
}
// Save using specified charset, if given
if let Some(custom_charset) = options.charset.clone() {
document_encoding = custom_charset;
if let Some(custom_encoding) = options.encoding.clone() {
document_encoding = custom_encoding;
dom = set_charset(dom, document_encoding.clone());
}

@ -1,15 +1,19 @@
use clap::{App, Arg, ArgAction};
use std::env;
use crate::cookies::Cookie;
#[derive(Default)]
pub struct Options {
pub no_audio: bool,
pub base_url: Option<String>,
pub blacklist_domains: bool,
pub no_css: bool,
pub charset: Option<String>,
pub cookie_file: Option<String>,
pub cookies: Vec<Cookie>,
pub domains: Option<Vec<String>>,
pub ignore_errors: bool,
pub encoding: Option<String>,
pub no_frames: bool,
pub no_fonts: bool,
pub no_images: bool,
@ -48,13 +52,13 @@ impl Options {
.version(env!("CARGO_PKG_VERSION"))
.author(format!("\n{}\n\n", env!("CARGO_PKG_AUTHORS").replace(':', "\n")).as_str())
.about(format!("{}\n{}", ASCII, env!("CARGO_PKG_DESCRIPTION")).as_str())
.args_from_usage("-a, --no-audio 'Removes audio sources'")
.args_from_usage("-b, --base-url=[http://localhost/] 'Sets custom base URL'")
.args_from_usage("-a, --no-audio 'Remove audio sources'")
.args_from_usage("-b, --base-url=[http://localhost/] 'Set custom base URL'")
.args_from_usage(
"-B, --blacklist-domains 'Treat list of specified domains as blacklist'",
)
.args_from_usage("-c, --no-css 'Removes CSS'")
.args_from_usage("-C, --charset=[UTF-8] 'Enforces custom encoding'")
.args_from_usage("-c, --no-css 'Remove CSS'")
.args_from_usage("-C, --cookies=[cookies.txt] 'Specify cookie file'")
.arg(
Arg::with_name("domains")
.short('d')
@ -65,23 +69,24 @@ impl Options {
.help("Specify domains to use for white/black-listing"),
)
.args_from_usage("-e, --ignore-errors 'Ignore network errors'")
.args_from_usage("-f, --no-frames 'Removes frames and iframes'")
.args_from_usage("-F, --no-fonts 'Removes fonts'")
.args_from_usage("-i, --no-images 'Removes images'")
.args_from_usage("-I, --isolate 'Cuts off document from the Internet'")
.args_from_usage("-j, --no-js 'Removes JavaScript'")
.args_from_usage("-k, --insecure 'Allows invalid X.509 (TLS) certificates'")
.args_from_usage("-M, --no-metadata 'Excludes timestamp and source information'")
.args_from_usage("-E, --encoding=[UTF-8] 'Enforce custom charset'")
.args_from_usage("-f, --no-frames 'Remove frames and iframes'")
.args_from_usage("-F, --no-fonts 'Remove fonts'")
.args_from_usage("-i, --no-images 'Remove images'")
.args_from_usage("-I, --isolate 'Cut off document from the Internet'")
.args_from_usage("-j, --no-js 'Remove JavaScript'")
.args_from_usage("-k, --insecure 'Allow invalid X.509 (TLS) certificates'")
.args_from_usage("-M, --no-metadata 'Exclude timestamp and source information'")
.args_from_usage(
"-n, --unwrap-noscript 'Replaces NOSCRIPT elements with their contents'",
"-n, --unwrap-noscript 'Replace NOSCRIPT elements with their contents'",
)
.args_from_usage(
"-o, --output=[document.html] 'Writes output to <file>, use - for STDOUT'",
"-o, --output=[document.html] 'Write output to <file>, use - for STDOUT'",
)
.args_from_usage("-s, --silent 'Suppresses verbosity'")
.args_from_usage("-t, --timeout=[60] 'Adjusts network request timeout'")
.args_from_usage("-u, --user-agent=[Firefox] 'Sets custom User-Agent string'")
.args_from_usage("-v, --no-video 'Removes video sources'")
.args_from_usage("-s, --silent 'Suppress verbosity'")
.args_from_usage("-t, --timeout=[60] 'Adjust network request timeout'")
.args_from_usage("-u, --user-agent=[Firefox] 'Set custom User-Agent string'")
.args_from_usage("-v, --no-video 'Remove video sources'")
.arg(
Arg::with_name("target")
.required(true)
@ -103,8 +108,11 @@ impl Options {
}
options.blacklist_domains = app.is_present("blacklist-domains");
options.no_css = app.is_present("no-css");
if let Some(charset) = app.value_of("charset") {
options.charset = Some(charset.to_string());
if let Some(cookie_file) = app.value_of("cookies") {
options.cookie_file = Some(cookie_file.to_string());
}
if let Some(encoding) = app.value_of("encoding") {
options.encoding = Some(encoding.to_string());
}
if let Some(domains) = app.get_many::<String>("domains") {
let list_of_domains: Vec<String> = domains.map(|v| v.clone()).collect::<Vec<_>>();

@ -1,5 +1,5 @@
use reqwest::blocking::Client;
use reqwest::header::CONTENT_TYPE;
use reqwest::header::{HeaderMap, HeaderValue, CONTENT_TYPE, COOKIE};
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};
@ -304,7 +304,17 @@ pub fn retrieve_asset(
}
// URL not in cache, we retrieve the file
match client.get(url.as_str()).send() {
let mut headers = HeaderMap::new();
if options.cookies.len() > 0 {
for cookie in &options.cookies {
if !cookie.is_expired() && cookie.matches_url(url.as_str()) {
let cookie_header_value: String = cookie.name.clone() + "=" + &cookie.value;
headers
.insert(COOKIE, HeaderValue::from_str(&cookie_header_value).unwrap());
}
}
}
match client.get(url.as_str()).headers(headers).send() {
Ok(response) => {
if !options.ignore_errors && response.status() != reqwest::StatusCode::OK {
if !options.silent {

@ -115,7 +115,7 @@ mod passing {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")
.arg("-C")
.arg("-E")
.arg("utf8")
.arg(format!(
"tests{s}_data_{s}unusual_encodings{s}gb2312.html",
@ -158,7 +158,7 @@ mod passing {
let mut cmd = Command::cargo_bin(env!("CARGO_PKG_NAME")).unwrap();
let out = cmd
.arg("-M")
.arg("-C")
.arg("-E")
.arg("utf0")
.arg(format!(
"tests{s}_data_{s}unusual_encodings{s}gb2312.html",

@ -0,0 +1,68 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use monolith::cookies;

    /// Builds a cookie for 127.0.0.1 whose only varying attribute is its expiry.
    fn cookie_expiring_at(expires: u64) -> cookies::Cookie {
        cookies::Cookie {
            domain: "127.0.0.1".to_string(),
            include_subdomains: true,
            path: "/".to_string(),
            https_only: false,
            expires,
            name: String::new(),
            value: String::new(),
        }
    }

    #[test]
    fn never_expires() {
        // An expiry of 0 denotes a session cookie
        let session_cookie = cookie_expiring_at(0);

        assert!(!session_cookie.is_expired());
    }

    #[test]
    fn expires_long_from_now() {
        let long_lived_cookie = cookie_expiring_at(9999999999);

        assert!(!long_lived_cookie.is_expired());
    }
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
    use monolith::cookies;

    #[test]
    fn expired() {
        // An expiry one second after the Unix epoch is long in the past
        let stale_cookie = cookies::Cookie {
            domain: "127.0.0.1".to_owned(),
            include_subdomains: true,
            path: "/".to_owned(),
            https_only: false,
            expires: 1,
            name: String::new(),
            value: String::new(),
        };

        let has_expired = stale_cookie.is_expired();
        assert!(has_expired);
    }
}

@ -0,0 +1,107 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use monolith::cookies;

    /// Builds a never-expiring cookie rooted at `/` that includes subdomains,
    /// varying only the domain and the HTTPS-only flag.
    fn root_cookie(domain: &str, https_only: bool) -> cookies::Cookie {
        cookies::Cookie {
            domain: domain.to_string(),
            include_subdomains: true,
            path: "/".to_string(),
            https_only,
            expires: 0,
            name: String::new(),
            value: String::new(),
        }
    }

    #[test]
    fn secure_url() {
        let secure_cookie = root_cookie("127.0.0.1", true);

        assert!(secure_cookie.matches_url("https://127.0.0.1/something"));
    }

    #[test]
    fn non_secure_url() {
        let plain_cookie = root_cookie("127.0.0.1", false);

        assert!(plain_cookie.matches_url("http://127.0.0.1/something"));
    }

    #[test]
    fn subdomain() {
        let wildcard_cookie = root_cookie(".somethingsomething.com", true);

        assert!(wildcard_cookie.matches_url("https://cdn.somethingsomething.com/something"));
    }
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
    use monolith::cookies;

    #[test]
    fn empty_url() {
        let cookie = cookies::Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains: true,
            path: String::from("/"),
            https_only: false,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };

        // An empty string is not a parsable URL, so nothing can match it
        assert!(!cookie.matches_url(""));
    }

    #[test]
    fn wrong_hostname() {
        let cookie = cookies::Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains: true,
            path: String::from("/"),
            https_only: false,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };

        assert!(!cookie.matches_url("http://0.0.0.0/"));
    }

    #[test]
    fn wrong_path() {
        // Host and scheme match on purpose so that only the path can cause the
        // rejection; a cookie path of "/" would prefix-match every URL path.
        let cookie = cookies::Cookie {
            domain: String::from("127.0.0.1"),
            include_subdomains: false,
            path: String::from("/admin"),
            https_only: false,
            expires: 0,
            name: String::from(""),
            value: String::from(""),
        };

        assert!(!cookie.matches_url("http://127.0.0.1/path"));
    }
}

@ -0,0 +1,2 @@
mod is_expired;
mod matches_url;

@ -0,0 +1,2 @@
mod cookie;
mod parse_cookie_file_contents;

@ -0,0 +1,87 @@
// ██████╗ █████╗ ███████╗███████╗██╗███╗ ██╗ ██████╗
// ██╔══██╗██╔══██╗██╔════╝██╔════╝██║████╗ ██║██╔════╝
// ██████╔╝███████║███████╗███████╗██║██╔██╗ ██║██║ ███╗
// ██╔═══╝ ██╔══██║╚════██║╚════██║██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║███████║███████║██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚══════╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod passing {
    use monolith::cookies;

    /// Asserts the fields that vary between the sample cookies; domain, path
    /// and name are the same for every record used in these tests.
    fn assert_user_token(
        cookie: &cookies::Cookie,
        include_subdomains: bool,
        https_only: bool,
        expires: u64,
        value: &str,
    ) {
        assert_eq!(cookie.domain, "127.0.0.1");
        assert_eq!(cookie.include_subdomains, include_subdomains);
        assert_eq!(cookie.path, "/");
        assert_eq!(cookie.https_only, https_only);
        assert_eq!(cookie.expires, expires);
        assert_eq!(cookie.name, "USER_TOKEN");
        assert_eq!(cookie.value, value);
    }

    #[test]
    fn parse_file() {
        let file_contents =
            "# Netscape HTTP Cookie File\n127.0.0.1\tFALSE\t/\tFALSE\t0\tUSER_TOKEN\tin";

        let parsed = cookies::parse_cookie_file_contents(&file_contents).unwrap();

        assert_eq!(parsed.len(), 1);
        assert_user_token(&parsed[0], false, false, 0, "in");
    }

    #[test]
    fn parse_multiline_file() {
        let file_contents = "# HTTP Cookie File\n127.0.0.1\tFALSE\t/\tFALSE\t0\tUSER_TOKEN\tin\n127.0.0.1\tTRUE\t/\tTRUE\t9\tUSER_TOKEN\tout\n\n";

        let parsed = cookies::parse_cookie_file_contents(&file_contents).unwrap();

        assert_eq!(parsed.len(), 2);
        assert_user_token(&parsed[0], false, false, 0, "in");
        assert_user_token(&parsed[1], true, true, 9, "out");
    }
}
// ███████╗ █████╗ ██╗██╗ ██╗███╗ ██╗ ██████╗
// ██╔════╝██╔══██╗██║██║ ██║████╗ ██║██╔════╝
// █████╗ ███████║██║██║ ██║██╔██╗ ██║██║ ███╗
// ██╔══╝ ██╔══██║██║██║ ██║██║╚██╗██║██║ ██║
// ██║ ██║ ██║██║███████╗██║██║ ╚████║╚██████╔╝
// ╚═╝ ╚═╝ ╚═╝╚═╝╚══════╝╚═╝╚═╝ ╚═══╝ ╚═════╝
#[cfg(test)]
mod failing {
    use monolith::cookies;

    #[test]
    fn empty() {
        // Empty input has no first line to validate, so it parses to nothing
        let file_contents = "";

        let result = cookies::parse_cookie_file_contents(file_contents).unwrap();

        assert_eq!(result.len(), 0);
    }

    #[test]
    fn no_header() {
        let file_contents = "127.0.0.1	FALSE	/	FALSE	0	USER_TOKEN	in";

        // Pin the exact error variant instead of asserting on constants
        assert!(matches!(
            cookies::parse_cookie_file_contents(file_contents),
            Err(cookies::CookieFileContentsParseError::InvalidHeader)
        ));
    }

    #[test]
    fn spaces_instead_of_tabs() {
        // Fields separated by spaces do not split into seven tab-delimited
        // columns, so the line is skipped
        let file_contents =
            "# HTTP Cookie File\n127.0.0.1   FALSE   /   FALSE   0   USER_TOKEN  in";

        let result = cookies::parse_cookie_file_contents(file_contents).unwrap();

        assert_eq!(result.len(), 0);
    }
}

@ -1,4 +1,5 @@
mod cli;
mod cookies;
mod css;
mod html;
mod js;

@ -16,7 +16,8 @@ mod passing {
assert_eq!(options.no_audio, false);
assert_eq!(options.base_url, None);
assert_eq!(options.no_css, false);
assert_eq!(options.charset, None);
assert_eq!(options.cookie_file, None);
assert_eq!(options.encoding, None);
assert_eq!(options.no_frames, false);
assert_eq!(options.no_fonts, false);
assert_eq!(options.no_images, false);

Loading…
Cancel
Save