From 7e298b0b02cdadf449f95f4b72c922f37491d398 Mon Sep 17 00:00:00 2001 From: Vincent Flyson Date: Fri, 23 Aug 2019 14:33:18 -0400 Subject: [PATCH] Add -u flag for custom User-Agent --- Cargo.toml | 3 +-- README.md | 5 +---- src/html.rs | 48 +++++++++++++++++++++++++++++++++++++++--------- src/http.rs | 9 +++++++-- src/main.rs | 8 ++++++-- 5 files changed, 54 insertions(+), 19 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index f841eca..ca1b409 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "monolith" -version = "2.0.4" +version = "2.0.5" authors = ["Sunshine "] description = "CLI tool to save webpages as a single HTML file" @@ -14,4 +14,3 @@ regex = "1.2.1" reqwest = "0.9.20" url = "2.1.0" lazy_static = "1.3.0" - diff --git a/README.md b/README.md index 8e4b6e0..9c04c28 100644 --- a/README.md +++ b/README.md @@ -17,14 +17,11 @@ If compared to saving websites with `wget -mpk`, `monolith` embeds all assets as ### Usage $ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html - - - ### Options - `-i`: Remove images - `-j`: Exclude JavaScript - + - `-u`: Specify custom User-Agent ### License The Unlicense diff --git a/src/html.rs b/src/html.rs index 840afb6..63c1d7d 100644 --- a/src/html.rs +++ b/src/html.rs @@ -47,13 +47,23 @@ const JS_DOM_EVENT_ATTRS: [&str; 21] = [ "onresize", ]; -#[allow(clippy::cognitive_complexity)] -pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_images: bool) { +pub fn walk_and_embed_assets( + url: &str, + node: &Handle, + opt_no_js: bool, + opt_no_images: bool, + opt_user_agent: &str, +) { match node.data { NodeData::Document => { // Dig deeper for child in node.children.borrow().iter() { - walk_and_embed_assets(&url, child, opt_no_js, opt_no_images); + walk_and_embed_assets( + &url, child, + opt_no_js, + opt_no_images, + opt_user_agent, + ); } } @@ -102,7 +112,12 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { let href_full_url = resolve_url(&url, &attr.value.to_string()); - let favicon_datauri = retrieve_asset(&href_full_url.unwrap(), true, ""); + let favicon_datauri = retrieve_asset( + &href_full_url.unwrap(), + true, + "", + opt_user_agent, + ); attr.value.clear(); attr.value.push_slice(favicon_datauri.unwrap().as_str()); } @@ -116,7 +131,12 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i attr.value.push_slice(PNG_PIXEL); } else { let src_full_url = resolve_url(&url, &attr.value.to_string()); - let img_datauri = retrieve_asset(&src_full_url.unwrap(), true, ""); + let img_datauri = retrieve_asset( + &src_full_url.unwrap(), + true, + "", + opt_user_agent, + ); attr.value.clear(); attr.value.push_slice(img_datauri.unwrap().as_str()); } @@ -141,8 +161,12 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i for attr in attrs_mut.iter_mut() { if &attr.name.local == "href" { let href_full_url = resolve_url(&url, &attr.value.to_string()); - let css_datauri = - retrieve_asset(&href_full_url.unwrap(), true, "text/css"); + let css_datauri = retrieve_asset( + &href_full_url.unwrap(), + true, + "text/css", + opt_user_agent, + ); attr.value.clear(); attr.value.push_slice(css_datauri.unwrap().as_str()); } @@ -165,6 +189,7 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i &src_full_url.unwrap(), true, "application/javascript", + opt_user_agent, ); attr.value.clear(); attr.value.push_slice(js_datauri.unwrap().as_str()); @@ -200,7 +225,13 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i // Dig deeper for child in node.children.borrow().iter() { - walk_and_embed_assets(&url, child, opt_no_js, opt_no_images); + walk_and_embed_assets( + &url, + child, + opt_no_js, + opt_no_images, + opt_user_agent, + ); } } @@ -216,7 +247,6 @@ pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom { } pub fn print_dom(handle: &Handle) { - // TODO: append to the if opt_isolate serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap(); } diff --git a/src/http.rs b/src/http.rs index e8be4b3..19e064d 100644 --- a/src/http.rs +++ b/src/http.rs @@ -1,5 +1,5 @@ use regex::Regex; -use reqwest::header::CONTENT_TYPE; +use reqwest::header::{CONTENT_TYPE, USER_AGENT}; use reqwest::Client; use std::time::Duration; use url::{ParseError, Url}; @@ -69,6 +69,7 @@ pub fn retrieve_asset( url: &str, as_dataurl: bool, as_mime: &str, + opt_user_agent: &str, ) -> Result { if url_is_data(&url).unwrap() { Ok(url.to_string()) @@ -77,7 +78,11 @@ pub fn retrieve_asset( .timeout(Duration::from_secs(10)) .build() .unwrap(); - let mut response = client.get(url).send().unwrap(); + let mut response = client + .get(url) + .header(USER_AGENT, opt_user_agent) + .send() + .unwrap(); if as_dataurl { // Convert response into a byte array diff --git a/src/main.rs b/src/main.rs index 1492e9e..cfb4872 100644 --- a/src/main.rs +++ b/src/main.rs @@ -6,6 +6,8 @@ use clap::{App, Arg}; use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets}; use monolith::http::{is_valid_url, retrieve_asset}; +static DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0"; + fn main() { let command = App::new("monolith") .version(crate_version!()) @@ -20,18 +22,20 @@ fn main() { ) .args_from_usage("-j, --no-js 'Excludes JavaScript'") .args_from_usage("-i, --no-images 'Removes images'") + .args_from_usage("-u, --user-agent= 'Custom User-Agent string'") .get_matches(); // Process the command let arg_target = command.value_of("url").unwrap(); let opt_no_js = command.is_present("no-js"); let opt_no_img = command.is_present("no-images"); + let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT); if is_valid_url(arg_target) { - let data = retrieve_asset(&arg_target, false, ""); + let data = retrieve_asset(&arg_target, false, "", opt_user_agent); let dom = html_to_dom(&data.unwrap()); - walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_img); + walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_img, opt_user_agent); print_dom(&dom.document); println!(); // Ensure newline at end of output