Merge pull request #22 from Y2Z/user-agent

Add -u flag for custom User-Agent
pull/23/head
Vincent Flyson 5 years ago committed by GitHub
commit 75969c9943
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -1,6 +1,6 @@
[package] [package]
name = "monolith" name = "monolith"
version = "2.0.4" version = "2.0.5"
authors = ["Sunshine <sunshine@uberspace.net>"] authors = ["Sunshine <sunshine@uberspace.net>"]
description = "CLI tool to save webpages as a single HTML file" description = "CLI tool to save webpages as a single HTML file"
@@ -14,4 +14,3 @@ regex = "1.2.1"
reqwest = "0.9.20" reqwest = "0.9.20"
url = "2.1.0" url = "2.1.0"
lazy_static = "1.3.0" lazy_static = "1.3.0"

@@ -17,14 +17,11 @@ If compared to saving websites with `wget -mpk`, `monolith` embeds all assets as
### Usage ### Usage
$ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html $ monolith https://lyrics.github.io/db/p/portishead/dummy/roads/ > portishead-roads-lyrics.html
<!-- or -->
<!-- cat local.html | monolith - > local.html -->
### Options ### Options
- `-i`: Remove images - `-i`: Remove images
- `-j`: Exclude JavaScript - `-j`: Exclude JavaScript
<!-- - `-a`: Don't make anchors link to remote documents --> - `-u`: Specify custom User-Agent
### License ### License
The Unlicense The Unlicense

@@ -47,13 +47,23 @@ const JS_DOM_EVENT_ATTRS: [&str; 21] = [
"onresize", "onresize",
]; ];
#[allow(clippy::cognitive_complexity)] pub fn walk_and_embed_assets(
pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_images: bool) { url: &str,
node: &Handle,
opt_no_js: bool,
opt_no_images: bool,
opt_user_agent: &str,
) {
match node.data { match node.data {
NodeData::Document => { NodeData::Document => {
// Dig deeper // Dig deeper
for child in node.children.borrow().iter() { for child in node.children.borrow().iter() {
walk_and_embed_assets(&url, child, opt_no_js, opt_no_images); walk_and_embed_assets(
&url, child,
opt_no_js,
opt_no_images,
opt_user_agent,
);
} }
} }
@@ -102,7 +112,12 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" { if &attr.name.local == "href" {
let href_full_url = resolve_url(&url, &attr.value.to_string()); let href_full_url = resolve_url(&url, &attr.value.to_string());
let favicon_datauri = retrieve_asset(&href_full_url.unwrap(), true, ""); let favicon_datauri = retrieve_asset(
&href_full_url.unwrap(),
true,
"",
opt_user_agent,
);
attr.value.clear(); attr.value.clear();
attr.value.push_slice(favicon_datauri.unwrap().as_str()); attr.value.push_slice(favicon_datauri.unwrap().as_str());
} }
@@ -116,7 +131,12 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i
attr.value.push_slice(PNG_PIXEL); attr.value.push_slice(PNG_PIXEL);
} else { } else {
let src_full_url = resolve_url(&url, &attr.value.to_string()); let src_full_url = resolve_url(&url, &attr.value.to_string());
let img_datauri = retrieve_asset(&src_full_url.unwrap(), true, ""); let img_datauri = retrieve_asset(
&src_full_url.unwrap(),
true,
"",
opt_user_agent,
);
attr.value.clear(); attr.value.clear();
attr.value.push_slice(img_datauri.unwrap().as_str()); attr.value.push_slice(img_datauri.unwrap().as_str());
} }
@@ -141,8 +161,12 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i
for attr in attrs_mut.iter_mut() { for attr in attrs_mut.iter_mut() {
if &attr.name.local == "href" { if &attr.name.local == "href" {
let href_full_url = resolve_url(&url, &attr.value.to_string()); let href_full_url = resolve_url(&url, &attr.value.to_string());
let css_datauri = let css_datauri = retrieve_asset(
retrieve_asset(&href_full_url.unwrap(), true, "text/css"); &href_full_url.unwrap(),
true,
"text/css",
opt_user_agent,
);
attr.value.clear(); attr.value.clear();
attr.value.push_slice(css_datauri.unwrap().as_str()); attr.value.push_slice(css_datauri.unwrap().as_str());
} }
@@ -165,6 +189,7 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i
&src_full_url.unwrap(), &src_full_url.unwrap(),
true, true,
"application/javascript", "application/javascript",
opt_user_agent,
); );
attr.value.clear(); attr.value.clear();
attr.value.push_slice(js_datauri.unwrap().as_str()); attr.value.push_slice(js_datauri.unwrap().as_str());
@@ -200,7 +225,13 @@ pub fn walk_and_embed_assets(url: &str, node: &Handle, opt_no_js: bool, opt_no_i
// Dig deeper // Dig deeper
for child in node.children.borrow().iter() { for child in node.children.borrow().iter() {
walk_and_embed_assets(&url, child, opt_no_js, opt_no_images); walk_and_embed_assets(
&url,
child,
opt_no_js,
opt_no_images,
opt_user_agent,
);
} }
} }
@@ -216,7 +247,6 @@ pub fn html_to_dom(data: &str) -> html5ever::rcdom::RcDom {
} }
pub fn print_dom(handle: &Handle) { pub fn print_dom(handle: &Handle) {
// TODO: append <meta http-equiv="Access-Control-Allow-Origin" content="'self'"/> to the <head> if opt_isolate
serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap(); serialize(&mut io::stdout(), handle, SerializeOpts::default()).unwrap();
} }

@@ -1,5 +1,5 @@
use regex::Regex; use regex::Regex;
use reqwest::header::CONTENT_TYPE; use reqwest::header::{CONTENT_TYPE, USER_AGENT};
use reqwest::Client; use reqwest::Client;
use std::time::Duration; use std::time::Duration;
use url::{ParseError, Url}; use url::{ParseError, Url};
@@ -69,6 +69,7 @@ pub fn retrieve_asset(
url: &str, url: &str,
as_dataurl: bool, as_dataurl: bool,
as_mime: &str, as_mime: &str,
opt_user_agent: &str,
) -> Result<String, reqwest::Error> { ) -> Result<String, reqwest::Error> {
if url_is_data(&url).unwrap() { if url_is_data(&url).unwrap() {
Ok(url.to_string()) Ok(url.to_string())
@@ -77,7 +78,11 @@ pub fn retrieve_asset(
.timeout(Duration::from_secs(10)) .timeout(Duration::from_secs(10))
.build() .build()
.unwrap(); .unwrap();
let mut response = client.get(url).send().unwrap(); let mut response = client
.get(url)
.header(USER_AGENT, opt_user_agent)
.send()
.unwrap();
if as_dataurl { if as_dataurl {
// Convert response into a byte array // Convert response into a byte array

@@ -6,6 +6,8 @@ use clap::{App, Arg};
use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets}; use monolith::html::{html_to_dom, print_dom, walk_and_embed_assets};
use monolith::http::{is_valid_url, retrieve_asset}; use monolith::http::{is_valid_url, retrieve_asset};
static DEFAULT_USER_AGENT: &str = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0";
fn main() { fn main() {
let command = App::new("monolith") let command = App::new("monolith")
.version(crate_version!()) .version(crate_version!())
@@ -20,18 +22,20 @@ fn main() {
) )
.args_from_usage("-j, --no-js 'Excludes JavaScript'") .args_from_usage("-j, --no-js 'Excludes JavaScript'")
.args_from_usage("-i, --no-images 'Removes images'") .args_from_usage("-i, --no-images 'Removes images'")
.args_from_usage("-u, --user-agent=<Iceweasel> 'Custom User-Agent string'")
.get_matches(); .get_matches();
// Process the command // Process the command
let arg_target = command.value_of("url").unwrap(); let arg_target = command.value_of("url").unwrap();
let opt_no_js = command.is_present("no-js"); let opt_no_js = command.is_present("no-js");
let opt_no_img = command.is_present("no-images"); let opt_no_img = command.is_present("no-images");
let opt_user_agent = command.value_of("user-agent").unwrap_or(DEFAULT_USER_AGENT);
if is_valid_url(arg_target) { if is_valid_url(arg_target) {
let data = retrieve_asset(&arg_target, false, ""); let data = retrieve_asset(&arg_target, false, "", opt_user_agent);
let dom = html_to_dom(&data.unwrap()); let dom = html_to_dom(&data.unwrap());
walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_img); walk_and_embed_assets(&arg_target, &dom.document, opt_no_js, opt_no_img, opt_user_agent);
print_dom(&dom.document); print_dom(&dom.document);
println!(); // Ensure newline at end of output println!(); // Ensure newline at end of output

Loading…
Cancel
Save