Dual Node/Deno package!

+ fix "bad CLI usage" exit code on yargs fail
main
gardenapple 2 years ago
parent 79677040df
commit 247425c6f5
No known key found for this signature in database
GPG Key ID: CAF17E9ABE789268

@ -0,0 +1,557 @@
/*
Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library
Copyright (C) 2022 gardenapple
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
/**
 * Runtime-independent body of the command-line tool. Everything that differs
 * between runtimes is passed in by the caller, so this module itself never
 * imports Node- or Deno-specific code.
 *
 * @param Buffer Buffer implementation; only `Buffer.concat(chunks).toString("utf8")` is used.
 * @param fs File system module; only `fs.createWriteStream(path)` is used.
 * @param process Process object; `stdin`, `stdout`, `env` and `exitCode` are used.
 * @param yargs Argument parser instance that the options are registered on.
 * @param __ Localization function, always invoked as a template tag.
 * @param Readability Constructor called as `new Readability(document, options)`; `.parse()` is called on the result.
 * @param isProbablyReaderable Called with the document; a falsy result means "low confidence".
 * @param printVersion Called without arguments when --version is given.
 * @param parseDOM Called as `(html, url)`; awaited and destructured as `[document, window]`.
 * @param parseDOMFromFile Called as `(file, url)`; awaited and destructured as `[document, window]`.
 * @param parseDOMFromURL Called as `(url, proxy, strictSSL, userAgent)`; awaited and destructured as `[document, window]`.
 * @param sanitizeDOM Called as `(document, window)`; the awaited result is written to the output as HTML.
 * @param sanitizeHTML Called as `(html, window)`; the awaited result is written to the output as HTML.
 * @returns A promise that settles once output has been handed to the write stream
 *          or an error has been reported; failures are signalled through
 *          `process.exitCode`, not through the resolved value.
 */
export default async function(
Buffer,
fs,
process,
yargs,
__,
Readability,
isProbablyReaderable,
printVersion,
parseDOM,
parseDOMFromFile,
parseDOMFromURL,
sanitizeDOM,
sanitizeHTML
) {
// Flips to true as soon as any step reports a failure.
let errored = false;

/**
 * Marks the run as failed and records the status the process should exit with.
 * The process is not terminated here; callers return on their own.
 *
 * @param exitCode Numeric exit status to report.
 */
function setErrored(exitCode) {
    errored = true;
    process.exitCode = exitCode;
}
// Exit statuses reported through setErrored(). The numbers coincide with the
// BSD sysexits.h conventions (EX_USAGE, EX_DATAERR, EX_NOINPUT, EX_NOHOST,
// EX_UNAVAILABLE, EX_NOPERM).
const ExitCodes = {
// Invalid command line: bad option values, missing input, malformed URL.
badUsageCLI: 64,
// The document could not be processed, or properties could not be produced.
dataError: 65,
// The input file does not exist (ENOENT).
noInput: 66,
// The host name could not be resolved (ENOTFOUND).
noHost: 68,
// HTTP status errors and any other unclassified loading failure.
serviceUnavailable: 69,
// The input file could not be accessed (EACCES).
noPermission: 77
};
//
//Parsing arguments
//
const Properties = new Map([
["html-title", (article, singleLine, window) =>
`<h1>${escapeHTML(Properties.get("title")(article, singleLine, window), window.document)}</h1>`
],
["title", (article, singleLine) =>
singleLine ? article.title.replace(/\n+/gm, ' ') : article.title
],
["excerpt", (article, singleLine) =>
singleLine ? article.excerpt.replace(/\n+/gm, ' ') : article.excerpt
],
["byline", (article, singleLine) =>
singleLine ? article.byline.replace(/\n+/gm, ' ') : article.byline
],
["length", article => article.length],
["dir", article => article.dir],
["text-content", article => article.textContent],
["html-content", (article, _, window) => article.content]
]);
// Accepted values of --low-confidence, i.e. what happens when
// isProbablyReaderable() reports that the page may not be an article.
const LowConfidenceMode = {
// Used when the option is absent: the document is written out without being processed.
keep: "keep",
// Skip the isProbablyReaderable() check and always run Readability.
force: "force",
// Report a data error and stop.
exit: "exit"
};
// Options object handed to the Readability constructor; "keepClasses" is set
// on it when --keep-classes is given.
const readabilityOptions = {};
//backwards compat with old, comma-separated values
function yargsCompatProperties(args) {
if (args["properties"]) {
for (var i = 0; i < args["properties"].length; i++) {
const property = args["properties"][i];
if (property.indexOf(',') > -1) {
const split = args["properties"][i].split(',');
args["properties"].splice(i, 1, ...split);
continue;
}
if (!Properties.has(property)) {
args["properties"].splice(i, 1);
i--;
if (!args["--"])
args["--"] = [ property ];
else
args["--"].push(property);
}
}
}
}
//Positional sometimes don't get recognized when they're put
//after other arguments, I think it's an oversight in yargs.
function yargsFixPositional(args) {
if (args["--"]) {
if (!args["source"])
args["source"] = args["--"].shift();
args["_"] = args["--"];
}
}
// Declare the whole command-line interface and parse the arguments.
// The order of the .option() calls is kept as-is because it is the order in
// which they were registered with yargs.
let args = yargs
// yargs' built-in --version is switched off; a custom "version" option is
// declared below and handled after parsing by calling printVersion().
.version(false)
.command("* [source]", __`Process HTML input`, (yargs) => {
yargs.positional("source", {
desc: __`A file, an http(s) URL, or '-' for standard input`,
type: "string"
});
})
.completion("--completion", false)
// NOTE(review): the second argument is presumably yargs' "applyBeforeValidation"
// flag, so both fix-ups run before the `choices` checks — confirm against the yargs docs.
.middleware([ yargsCompatProperties, yargsFixPositional ], true) //middleware seems to be buggy
.option("help", {
alias: 'h',
type: "boolean",
desc: __`Show help`
})
.option("completion", {
type: "boolean",
desc: __`Print script for bash/zsh completion`
})
.option("base", {
alias: 'b',
type: "string",
desc: __`Set the document URL when parsing standard input or a local file (this affects relative links)`
})
.option("insane", {
alias: 'S',
type: "boolean",
desc: __`Don't sanitize HTML`
})
// Hidden from the help text under Deno, where the option is rejected after parsing.
.option("insecure", {
alias: 'K',
type: "boolean",
desc: __`Allow invalid SSL certificates`,
hidden: typeof Deno !== "undefined"
})
.option("is-file", {
alias: 'f',
type: "boolean",
desc: __`Interpret SOURCE as a file name rather than a URL`,
default: false,
hidden: true,
//deprecated: true
})
.option("is-url", {
alias: 'U',
type: "boolean",
desc: __`(deprecated) Interpret SOURCE as a URL rather than file name`,
hidden: true,
//deprecated: true
})
.option("json", {
alias: 'j',
type: "boolean",
desc: __`Output properties as a JSON payload`
})
.option("low-confidence", {
alias: 'l',
type: "string",
desc: __`What to do if Readability.js is uncertain about what the core content actually is`,
choices: ["keep", "force", "exit"]
//default: "no-op", //don't set default because completion won't work
})
.option("keep-classes", {
alias: 'C',
type: "boolean",
desc: __`Preserve all CSS classes for input elements, instead of adapting to Firefox's Reader Mode`
})
.option("output", {
alias: 'o',
type: "string",
desc: __`The file to which the result should be output`
})
.option("properties", {
alias: 'p',
type: "array",
desc: __`Output specific properties of the parsed article`,
choices: Array.from(Properties.keys())
})
// Hidden from the help text under Deno, where the option is rejected after parsing.
.option("proxy", {
alias: 'x',
type: "string",
desc: __`Use specified proxy (can also use HTTPS_PROXY environment variable)`,
hidden: typeof Deno !== "undefined"
})
.option("quiet", {
alias: 'q',
type: "boolean",
desc: __`Don't output extra information to stderr`
})
.option("style", {
alias: 's',
type: "string",
desc: __`Specify .css file for stylesheet`
})
.option("url", {
alias: 'u',
type: "string",
desc: __`(deprecated) alias for --base`,
hidden: true,
//deprecated: true //completion script does not respect this value, so just say it in the description
})
.option("user-agent", {
alias: 'A',
type: "string",
desc: __`Set custom user agent string`
})
.option("version", {
alias: 'V',
type: "boolean",
desc: __`Print version`
})
// On a parsing/validation failure: print the message and record the "bad CLI
// usage" exit code. Execution continues after .parse(); the usage text is
// printed later, when the `errored` flag is checked.
.fail((msg, err, yargs) => {
console.error(msg);
setErrored(ExitCodes.badUsageCLI);
})
.epilogue(__`See the manual for more info: man readability-cli`)
// Help output is wrapped at the terminal width, but never wider than 100 columns.
.wrap(Math.min(yargs.terminalWidth(), 100))
.strict()
.parse();
// Deprecated flags keep working, but each one is announced on stderr.
if (args["is-url"])
    console.error(__`Note: --is-url option is deprecated.`);

const legacyBaseURL = args["url"];
if (legacyBaseURL) {
    console.error(__`Note: --url option is deprecated, please use --base instead.`);
    // --url is the old spelling of --base, so it overrides the base URL.
    args["base"] = legacyBaseURL;
}

/**
 * Prints the yargs-generated help text.
 */
function printUsage() {
    yargs.showHelp();
}
if (args["completion"]) {
yargs.showCompletionScript();
return;
}
if (args["version"]) {
printVersion();
return;
}
if (typeof Deno !== "undefined") {
for (const option of ["insecure", "proxy"]) {
if (args[option]) {
console.error(__`Warning: option --${option} is not supported in Deno.`);
setErrored(ExitCodes.badUsageCLI);
return;
}
}
}
if (args["keep-classes"]) {
readabilityOptions["keepClasses"] = true;
}
if (!args["low-confidence"]) {
args["low-confidence"] = LowConfidenceMode.keep;
args['l'] = LowConfidenceMode.keep;
} else if (!Object.values(LowConfidenceMode).includes(args["low-confidence"])) {
console.error(__`Unknown mode: ${args["low-confidence"]}\nPlease use one of: keep, force, exit`);
console.error(__`Use --help for more info.`);
setErrored(ExitCodes.badUsageCLI);
return;
}
let inputArg;
if (!args["source"]) {
if (process.stdin.isTTY) {
console.error(__`No input provided`);
printUsage();
setErrored(ExitCodes.badUsageCLI);
return;
} else {
inputArg = '-'
}
} else {
inputArg = args["source"];
}
//Get input parameter, remove inputArg from args
let inputFile;
let inputURL;
let inputIsFromStdin = false;
if (args["is-url"] && inputArg.search(/^\w+:\/\//) == -1)
inputArg = "https://" + inputArg;
if (!args["is-file"] && inputArg.search(/^\w+:\/\//) != -1)
inputURL = inputArg;
else if (inputArg == '-')
inputIsFromStdin = true;
else
inputFile = inputArg;
const outputArg = args["output"];
const documentURL = args["base"] || inputURL;
const outputJSON = args["json"];
let proxy = args["proxy"];
if (!proxy && typeof Deno === "undefined")
proxy = process.env.https_proxy || process.env.HTTPS_PROXY || process.env.http_proxy;
let wantedProperties;
if (args["properties"]) {
wantedProperties = args["properties"];
}
if (errored) {
printUsage();
return;
}
async function read(stream) {
const chunks = [];
for await (const chunk of stream){
chunks.push(chunk);
}
return Buffer.concat(chunks).toString("utf8");
}
let document, window
try {
if (inputIsFromStdin) {
if (!args["quiet"]) {
console.error("Reading...");
if (!documentURL)
console.error(__`Warning: piping input with unknown URL. This means that relative links will be broken. Supply the --base parameter to fix.`)
}
const input = await read(process.stdin);
[document, window] = await parseDOM(result, documentURL);
} else {
if (!args["quiet"])
console.error(__`Retrieving...`);
let parseDOMPromise;
if (inputURL) {
parseDOMPromise = parseDOMFromURL(documentURL, proxy, !args["insecure"], args["user-agent"]);
} else if (inputFile) {
parseDOMPromise = parseDOMFromFile(inputFile, documentURL);
}
[document, window] = await parseDOMPromise;
}
} catch (error) {
if (error.error) {
//Nested error?
error = error.error;
}
if (error instanceof TypeError && inputURL) {
console.error(__`Invalid URL: ${inputURL}`);
setErrored(ExitCodes.badUsageCLI);
} else if (error.code == "ENOENT") {
console.error(error.message);
setErrored(ExitCodes.noInput);
} else if (error.code == "EACCES") {
console.error(error.message);
setErrored(ExitCodes.noPermission);
} else if (error.code == "ENOTFOUND") {
console.error(__`Host not found: ${error.hostname}`);
setErrored(ExitCodes.noHost);
} else if (error.statusCode) {
console.error(__`Status error: ${error.response.statusMessage}`);
setErrored(ExitCodes.serviceUnavailable);
} else {
console.error(error.message);
//console.error(error);
setErrored(ExitCodes.serviceUnavailable);
}
return;
}
//Taken from https://stackoverflow.com/a/22706073/5701177
function escapeHTML(string, document) {
var p = document.createElement("p");
p.appendChild(document.createTextNode(string));
return p.innerHTML;
}
async function getHTML(document, window) {
if (args["insane"])
return document.documentElement.outerHTML;
else
return await sanitizeDOM(document, window);
}
let shouldParseArticle = true;
if (args["low-confidence"] != LowConfidenceMode.force)
shouldParseArticle = isProbablyReaderable(document);
if (!shouldParseArticle) {
if (args["low-confidence"] == LowConfidenceMode.exit) {
console.error(__`Not sure if this document should be processed, exiting`);
setErrored(ExitCodes.dataError);
return;
} else {
if (!args["quiet"])
console.error(__`Not sure if this document should be processed. Not processing`);
if (args["json"] || wantedProperties) {
console.error(__`Can't output properties`);
setErrored(ExitCodes.dataError);
return;
}
shouldParseArticle = false;
}
}
let writeStream;
if (outputArg) {
writeStream = fs.createWriteStream(outputArg);
} else {
writeStream = process.stdout;
}
if (!shouldParseArticle) {
//Ignore wantedProperties, that should've thrown an error before
writeStream.write(await getHTML(document, window));
return;
}
if (!args["quiet"])
console.error(__`Processing...`);
const reader = new Readability(document, readabilityOptions);
const article = reader.parse();
if (!article) {
if (args["low-confidence"] == LowConfidenceMode.keep) {
if (!args["quiet"])
console.error(__`Couldn't process document.`);
writeStream.write(await getHTML(document, window));
} else {
console.error(__`Couldn't process document.`);
setErrored(ExitCodes.dataError);
}
return;
}
// Reads one named property of the parsed article. "html-content" is passed
// through sanitizeHTML unless --insane was given, so the JSON and --properties
// outputs are sanitized the same way as the full HTML page.
const readProperty = async (name, singleLine) => {
    const value = Properties.get(name)(article, singleLine, window);
    if (name == "html-content" && !args["insane"])
        return await sanitizeHTML(value, window);
    return value;
};

if (outputJSON) {
    // JSON payload: the requested properties, or all of them.
    const result = {};
    for (const propertyName of wantedProperties || Properties.keys())
        result[propertyName] = await readProperty(propertyName, false);
    writeStream.write(JSON.stringify(result));
} else if (wantedProperties) {
    // Plain text: one requested property per line.
    for (const propertyName of wantedProperties)
        writeStream.write(await readProperty(propertyName, true) + '\n');
} else {
    // Full HTML page.
    writeStream.write(`<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">`);
    if (args["style"] || !args["keep-classes"]) {
        // Without --style, link the stylesheet of Firefox's own Reader Mode.
        const cssHref = args["style"] || "chrome://global/skin/aboutReader.css";
        writeStream.write(`
<link rel="stylesheet" href="${cssHref}" type="text/css">`);
    }
    writeStream.write(`
<title>${escapeHTML(Properties.get("title")(article, false, window), document)}</title>
</head>
`
    );
    if (!args["keep-classes"]) {
        //Add a few divs and classes so that Firefox Reader Mode CSS works well
        writeStream.write(`
<body class="light sans-serif loaded" style="--font-size:14pt; --content-width:40em;">
<div class="container" `
        );
        const contentDir = Properties.get("dir")(article, false, window);
        if (contentDir)
            writeStream.write(`dir="${contentDir}">`);
        else
            writeStream.write('>');
        writeStream.write(`
<div class="header reader-header reader-show-element">
<h1 class="reader-title">${escapeHTML(Properties.get("title")(article, false, window), document)}</h1>`);
        const author = Properties.get("byline")(article, false, window);
        if (author) {
            writeStream.write(`
<div class="credits reader-credits">${escapeHTML(author, document)}</div>`);
        }
        writeStream.write(`
</div>
<hr>
<div class="content">
<div class="moz-reader-content reader-show-element">
`
        );
        writeStream.write(await readProperty("html-content", false));
        writeStream.write(`
</div>
</div>
</div>
`
        );
    } else {
        // --keep-classes: minimal markup without the Reader Mode wrappers.
        writeStream.write("\n<body>\n");
        writeStream.write(Properties.get("html-title")(article, false, window));
        writeStream.write('\n');
        const author = Properties.get("byline")(article, false, window);
        if (author) {
            writeStream.write(`<p><i>${escapeHTML(author, document)}</i></p>`);
        }
        writeStream.write("\n<hr>\n");
        writeStream.write(await readProperty("html-content", false));
    }
    writeStream.write("\n</body></html>");
}
}

@ -4,7 +4,7 @@
Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library
Copyright (C) 2021 gardenapple Copyright (C) 2022 gardenapple
This program is free software: you can redistribute it and/or modify This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by it under the terms of the GNU General Public License as published by
@ -20,7 +20,12 @@ Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library
along with this program. If not, see <https://www.gnu.org/licenses/>. along with this program. If not, see <https://www.gnu.org/licenses/>.
*/ */
const fs = require("fs");
const path = require("path"); const path = require("path");
const process = require("process");
const yargs = require("yargs");
const y18n = require("y18n");
// GNU gettext gives preference to LANGUAGE above all else, but this order is consistent with Yargs: // GNU gettext gives preference to LANGUAGE above all else, but this order is consistent with Yargs:
const locale = ( const locale = (
@ -31,541 +36,79 @@ const locale = (
"en_US" "en_US"
).replace(/[.:].*/, ''); ).replace(/[.:].*/, '');
const yargs = require("yargs"); const __ = y18n({
const __ = require("y18n")({
locale: locale, locale: locale,
updateFiles: false, updateFiles: false,
directory: path.resolve(__dirname, "locales") directory: path.resolve(__dirname, "./locales")
}).__; }).__;
//JSDOM, fs, Readability, and Readability-readerable are loaded on-demand. const { Readability, isProbablyReaderable } = require("@mozilla/readability");
const ExitCodes = {
badUsageCLI: 64,
dataError: 65,
noInput: 66,
noHost: 68,
serviceUnavailable: 69,
noPermission: 77
};
let errored = false;
function setErrored(exitCode) {
process.exitCode = exitCode;
errored = true;
}
//
//Parsing arguments
//
const Properties = new Map([
["html-title", (article, singleLine, window) =>
`<h1>${escapeHTML(Properties.get("title")(article, singleLine, window), window.document)}</h1>`
],
["title", (article, singleLine) =>
singleLine ? article.title.replace(/\n+/gm, ' ') : article.title
],
["excerpt", (article, singleLine) =>
singleLine ? article.excerpt.replace(/\n+/gm, ' ') : article.excerpt
],
["byline", (article, singleLine) =>
singleLine ? article.byline.replace(/\n+/gm, ' ') : article.byline
],
["length", article => article.length],
["dir", article => article.dir],
["text-content", article => article.textContent],
["html-content", (article, _, window) => {
if (!args["insane"]) {
const createDOMPurify = require("dompurify");
const DOMPurify = createDOMPurify(window);
return DOMPurify.sanitize(article.content);
}
return article.content;
}]
]);
const LowConfidenceMode = {
keep: "keep",
force: "force",
exit: "exit"
};
const readabilityOptions = {};
//backwards compat with old, comma-separated values
function yargsCompatProperties(args) {
if (args["properties"]) {
for (var i = 0; i < args["properties"].length; i++) {
const property = args["properties"][i];
if (property.indexOf(',') > -1) {
const split = args["properties"][i].split(',');
args["properties"].splice(i, 1, ...split);
continue;
}
if (!Properties.has(property)) {
args["properties"].splice(i, 1);
i--;
if (!args["--"])
args["--"] = [ property ];
else
args["--"].push(property);
}
}
}
}
//Positional sometimes don't get recognized when they're put
//after other arguments, I think it's an oversight in yargs.
function yargsFixPositional(args) {
if (args["--"]) {
if (!args["source"])
args["source"] = args["--"].shift();
args["_"] = args["--"];
}
}
let args = yargs
.version(false)
.command("* [source]", __`Process HTML input`, (yargs) => {
yargs.positional("source", {
desc: __`A file, an http(s) URL, or '-' for standard input`,
type: "string"
});
})
.completion('--completion', false)
.middleware([ yargsCompatProperties, yargsFixPositional ], true) //middleware seems to be buggy
.option("help", {
alias: 'h',
type: "boolean",
desc: __`Show help`
})
.option("completion", {
type: "boolean",
desc: __`Print script for bash/zsh completion`
})
.option("base", {
alias: 'b',
type: "string",
desc: __`Set the document URL when parsing standard input or a local file (this affects relative links)`
})
.option("insane", {
alias: 'S',
type: "boolean",
desc: __`Don't sanitize HTML`
})
.option("insecure", {
alias: 'K',
type: "boolean",
desc: __`Allow invalid SSL certificates`
})
.option("is-file", {
alias: 'f',
type: "boolean",
desc: __`Interpret SOURCE as a file name rather than a URL`,
default: false,
hidden: true,
//deprecated: true
})
.option("is-url", {
alias: 'U',
type: "boolean",
desc: __`(deprecated) Interpret SOURCE as a URL rather than file name`,
hidden: true,
//deprecated: true
})
.option("json", {
alias: 'j',
type: "boolean",
desc: __`Output properties as a JSON payload`
})
.option("low-confidence", {
alias: 'l',
type: "string",
desc: __`What to do if Readability.js is uncertain about what the core content actually is`,
choices: ["keep", "force", "exit"]
//default: "no-op", //don't set default because completion won't work
})
.option("keep-classes", {
alias: 'C',
type: "boolean",
desc: __`Preserve all CSS classes for input elements, instead of adapting to Firefox's Reader Mode`
})
.option("output", {
alias: 'o',
type: "string",
desc: __`The file to which the result should be output`
})
.option("properties", {
alias: 'p',
type: "array",
desc: __`Output specific properties of the parsed article`,
choices: Array.from(Properties.keys())
})
.option("proxy", {
alias: 'x',
type: "string",
desc: __`Use specified proxy (can also use HTTPS_PROXY environment variable)`
})
.option("quiet", {
alias: 'q',
type: "boolean",
desc: __`Don't output extra information to stderr`
})
.option("style", {
alias: 's',
type: "string",
desc: __`Specify .css file for stylesheet`
})
.option("url", {
alias: 'u',
type: "string",
desc: __`(deprecated) alias for --base`,
hidden: true,
//deprecated: true //completion script does not respect this value, so just say it in the description
})
.option("user-agent", {
alias: 'A',
type: "string",
desc: __`Set custom user agent string`
})
.option("version", {
alias: 'V',
type: "boolean",
desc: __`Print version`
})
.epilogue(__`See the manual for more info: man readability-cli`)
.wrap(Math.min(yargs.terminalWidth(), 100))
.strict()
.parse();
if (args["is-url"]) {
console.error(__`Note: --is-url option is deprecated.`);
}
if (args["url"]) {
console.error(__`Note: --url option is deprecated, please use --base instead.`);
args["base"] = args["url"];
}
function printUsage() {
yargs.showHelp();
}
if (args["completion"]) {
yargs.showCompletionScript();
return;
}
if (args["version"]) { function printVersion() {
console.log(`readability-cli v${require("./package.json").version}`); console.log(`readability-cli v${require("./package.json").version}`);
console.log(`Node.js ${process.version}`); console.log(`Node.js ${process.version}`);
return;
}
if (args["keep-classes"]) {
readabilityOptions["keepClasses"] = true;
}
if (!args["low-confidence"]) {
args["low-confidence"] = LowConfidenceMode.keep;
args['l'] = LowConfidenceMode.keep;
} else if (!Object.values(LowConfidenceMode).includes(args["low-confidence"])) {
console.error(__`Unknown mode: ${args["low-confidence"]}\nPlease use one of: keep, force, exit`);
console.error(__`Use --help for more info.`);
setErrored(ExitCodes.badUsageCLI);
return;
} }
async function parseDOMFromURL(url, proxy, strictSSL, userAgent) {
const { JSDOM, ResourceLoader } = require("jsdom");
const resourceLoader = new ResourceLoader({
proxy: proxy,
strictSSL: strictSSL,
userAgent: userAgent
});
let inputArg; const dom = await JSDOM.fromURL(url, {
if (!args["source"]) { resources: resourceLoader
if (process.stdin.isTTY) { })
console.error(__`No input provided`); return [dom.window.document, dom.window];
printUsage();
setErrored(ExitCodes.badUsageCLI);
return;
} else {
inputArg = '-'
}
} else {
inputArg = args["source"];
} }
//Get input parameter, remove inputArg from args async function parseDOM(html, url) {
let inputFile; const { JSDOM } = require("jsdom");
let inputURL; const dom = new JSDOM(html, { url: url });
let inputIsFromStdin = false; return [dom.window.document, dom.window];
if (args["is-url"] && inputArg.search(/^\w+:\/\//) == -1)
inputArg = "https://" + inputArg;
if (!args["is-file"] && inputArg.search(/^\w+:\/\//) != -1)
inputURL = inputArg;
else if (inputArg == '-')
inputIsFromStdin = true;
else
inputFile = inputArg;
const outputArg = args["output"];
const documentURL = args["base"] || inputURL;
const outputJSON = args["json"];
const proxy = args["proxy"] || process.env.https_proxy || process.env.HTTPS_PROXY || process.env.http_proxy;
let wantedProperties;
if (args["properties"]) {
wantedProperties = args["properties"];
} }
async function parseDOMFromFile(file, url) {
if (errored) { const { JSDOM } = require("jsdom");
printUsage(); const dom = await JSDOM.fromFile(file, {
return; url: url,
// workaround for https://gitlab.com/gardenappl/readability-cli/-/issues/9
contentType: "text/html; charset=utf-8"
})
return [dom.window.document, dom.window];
} }
async function read(stream) { async function sanitizeHTML(html, window) {
const chunks = []; const createDOMPurify = require("dompurify");
for await (const chunk of stream){ const DOMPurify = createDOMPurify(window);
chunks.push(chunk); return DOMPurify.sanitize(html);
}
return Buffer.concat(chunks).toString("utf8");
} }
async function sanitizeDOM(document, window) {
const createDOMPurify = require("dompurify");
if (inputIsFromStdin) { const DOMPurify = createDOMPurify(window);
if (!args["quiet"]) { DOMPurify.sanitize(document, {IN_PLACE: true, WHOLE_DOCUMENT: true});
console.error("Reading..."); return document.documentElement.outerHTML;
if (!documentURL)
console.error(__`Warning: piping input with unknown URL. This means that relative links will be broken. Supply the --base parameter to fix.`)
}
read(process.stdin).then(result => {
const JSDOM = require("jsdom").JSDOM;
onLoadDOM(new JSDOM(result, { url: documentURL }));
});
} else {
if (!args["quiet"])
console.error(__`Retrieving...`);
const jsdom = require("jsdom");
let promiseGetHTML;
if (inputURL) {
const resourceLoader = new jsdom.ResourceLoader({
proxy: proxy,
strictSSL: !args["insecure"],
userAgent: args["user-agent"]
});
promiseGetHTML = jsdom.JSDOM.fromURL(inputURL, {
resources: resourceLoader
});
} else if (inputFile) {
promiseGetHTML = jsdom.JSDOM.fromFile(inputFile, {
url: documentURL,
// workaround for https://gitlab.com/gardenappl/readability-cli/-/issues/9
contentType: "text/html; charset=utf-8"
});
}
promiseGetHTML.then(onLoadDOM, onLoadDOMError);
} }
import("./common.mjs").then((module) => {
const { Readability, isProbablyReaderable } = require("@mozilla/readability"); const readable = module.default;
readable(
//Taken from https://stackoverflow.com/a/22706073/5701177 Buffer,
function escapeHTML(string, document) { fs,
var p = document.createElement("p"); process,
p.appendChild(document.createTextNode(string)); yargs,
return p.innerHTML; __,
} Readability,
isProbablyReaderable,
function onLoadDOM(dom) { printVersion,
const window = dom.window parseDOM,
parseDOMFromFile,
let shouldParseArticle = true; parseDOMFromURL,
if (args["low-confidence"] != LowConfidenceMode.force) sanitizeDOM,
shouldParseArticle = isProbablyReaderable(window.document); sanitizeHTML
);
if (!shouldParseArticle) { });
if (args["low-confidence"] == LowConfidenceMode.exit) {
console.error(__`Not sure if this document should be processed, exiting`);
setErrored(ExitCodes.dataError);
return;
} else {
if (!args["quiet"])
console.error(__`Not sure if this document should be processed. Not processing`);
if (args["json"] || wantedProperties) {
console.error(__`Can't output properties`);
setErrored(ExitCodes.dataError);
return;
}
shouldParseArticle = false;
}
}
let writeStream;
if (outputArg) {
const fs = require("fs");
writeStream = fs.createWriteStream(outputArg);
} else {
writeStream = process.stdout;
}
if (!shouldParseArticle) {
//Ignore wantedProperties, that should've thrown an error before
writeStream.write(getHTML(window));
return;
}
if (!args["quiet"])
console.error(__`Processing...`);
const reader = new Readability(window.document, readabilityOptions);
const article = reader.parse();
if (!article) {
if (args["low-confidence"] == LowConfidenceMode.keep) {
if (!args["quiet"])
console.error(__`Couldn't process document.`);
writeStream.write(getHTML(window));
} else {
console.error(__`Couldn't process document.`);
setErrored(ExitCodes.dataError);
}
return;
}
if (outputJSON) {
let result = {};
if (wantedProperties) {
for (propertyName of wantedProperties)
result[propertyName] = Properties.get(propertyName)(article, false, window);
} else {
for (const [name, func] of Properties) {
result[name] = func(article, false, window);
}
}
writeStream.write(JSON.stringify(result));
} else {
if (wantedProperties) {
for (propertyName of wantedProperties)
writeStream.write(Properties.get(propertyName)(article, true, window) + '\n');
} else {
writeStream.write(`<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">`);
if (args["style"] || !args["keep-classes"]) {
const cssHref = args["style"] || "chrome://global/skin/aboutReader.css";
writeStream.write(`
<link rel="stylesheet" href="${cssHref}" type="text/css">`);
}
writeStream.write(`
<title>${escapeHTML(Properties.get("title")(article, false, window), window.document)}</title>
</head>
`
);
if (!args["keep-classes"]) {
//Add a few divs and classes so that Firefox Reader Mode CSS works well
writeStream.write(`
<body class="light sans-serif loaded" style="--font-size:14pt; --content-width:40em;">
<div class="container" `
);
const contentDir = Properties.get("dir")(article, false, window);
if (contentDir)
writeStream.write(`dir="${contentDir}">`);
else
writeStream.write('>');
writeStream.write(`
<div class="header reader-header reader-show-element">
<h1 class="reader-title">${escapeHTML(Properties.get("title")(article, false, window), window.document)}</h1>`);
const author = Properties.get("byline")(article, false, window);
if (author) {
writeStream.write(`
<div class="credits reader-credits">${escapeHTML(author, window.document)}</div>`);
}
writeStream.write(`
</div>
<hr>
<div class="content">
<div class="moz-reader-content reader-show-element">
`
);
writeStream.write(Properties.get("html-content")(article, false, window));
writeStream.write(`
</div>
</div>
</div>
`
);
} else {
writeStream.write("\n<body>\n");
writeStream.write(Properties.get("html-title")(article, false, window));
writeStream.write('\n');
const author = Properties.get("byline")(article, false, window);
if (author) {
writeStream.write(`<p><i>${escapeHTML(author, window.document)}</i></p>`);
}
writeStream.write("\n<hr>\n");
writeStream.write(Properties.get("html-content")(article, false, window));
}
writeStream.write("\n</body></html>");
}
}
}
function onLoadDOMError(error) {
if (error.error) {
//Nested error?
error = error.error;
}
if (error instanceof TypeError && inputURL) {
console.error(__`Invalid URL: ${inputURL}`);
setErrored(ExitCodes.badUsageCLI);
} else if (error.code == "ENOENT") {
console.error(error.message);
setErrored(ExitCodes.noInput);
} else if (error.code == "EACCES") {
console.error(error.message);
setErrored(ExitCodes.noPermission);
} else if (error.code == "ENOTFOUND") {
console.error(__`Host not found: ${error.hostname}`);
setErrored(ExitCodes.noHost);
} else if (error.statusCode) {
console.error(__`Status error: ${error.response.statusMessage}`);
setErrored(ExitCodes.serviceUnavailable);
} else {
console.error(error.message);
//console.error(error);
setErrored(ExitCodes.serviceUnavailable);
}
}
function getHTML(window) {
let html = window.document.documentElement.outerHTML;
if (!args["insane"]) {
const createDOMPurify = require("dompurify");
const DOMPurify = createDOMPurify(window);
return DOMPurify.sanitize(html, {IN_PLACE: true, WHOLE_DOCUMENT: true});
}
return html;
}

@ -34,5 +34,6 @@
"Host not found: '%s'": "Host not found: '%s'", "Host not found: '%s'": "Host not found: '%s'",
"Unknown mode: %s\nPlease use one of: keep, force, exit": "Unknown mode: %s\nPlease use one of: keep, force, exit", "Unknown mode: %s\nPlease use one of: keep, force, exit": "Unknown mode: %s\nPlease use one of: keep, force, exit",
"Use --help for more info.": "Use --help for more info.", "Use --help for more info.": "Use --help for more info.",
"See the manual for more info: man readability-cli": "See the manual for more info: man readability-cli" "See the manual for more info: man readability-cli": "See the manual for more info: man readability-cli",
"Error: option --%s is not supported in Deno.": "Error: option --%s is not supported in Deno."
} }

@ -34,5 +34,6 @@
"Host not found: '%s'": "Сервер не найден: '%s'", "Host not found: '%s'": "Сервер не найден: '%s'",
"Unknown mode: %s\nPlease use one of: keep, force, exit": "Неизвестный режим: %s\nПожалуйста, используйте один из: keep, force, exit", "Unknown mode: %s\nPlease use one of: keep, force, exit": "Неизвестный режим: %s\nПожалуйста, используйте один из: keep, force, exit",
"Use --help for more info.": "Чтобы узнать больше, воспользуйтесь --help", "Use --help for more info.": "Чтобы узнать больше, воспользуйтесь --help",
"See the manual for more info: man readability-cli": "Чтобы узнать больше, загляните в мануал: man readability-cli" "See the manual for more info: man readability-cli": "Чтобы узнать больше, загляните в мануал: man readability-cli",
"Error: option --%s is not supported in Deno.": "Ошибка: параметр --%s не поддерживается в Deno."
} }

14
package-lock.json generated

@ -13,7 +13,7 @@
"dompurify": "^2.3.4", "dompurify": "^2.3.4",
"jsdom": "^19.0.0", "jsdom": "^19.0.0",
"y18n": "^5.0.8", "y18n": "^5.0.8",
"yargs": "^17.3.0" "yargs": "^17.4.0"
}, },
"bin": { "bin": {
"readable": "index.js" "readable": "index.js"
@ -776,9 +776,9 @@
} }
}, },
"node_modules/yargs": { "node_modules/yargs": {
"version": "17.3.0", "version": "17.4.0",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.3.0.tgz", "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.4.0.tgz",
"integrity": "sha512-GQl1pWyDoGptFPJx9b9L6kmR33TGusZvXIZUT+BOz9f7X2L94oeAskFYLEg/FkhV06zZPBYLvLZRWeYId29lew==", "integrity": "sha512-WJudfrk81yWFSOkZYpAZx4Nt7V4xp7S/uJkX0CnxovMCt1wCE8LNftPpNuF9X/u9gN5nsD7ycYtRcDf2pL3UiA==",
"dependencies": { "dependencies": {
"cliui": "^7.0.2", "cliui": "^7.0.2",
"escalade": "^3.1.1", "escalade": "^3.1.1",
@ -1344,9 +1344,9 @@
"integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==" "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA=="
}, },
"yargs": { "yargs": {
"version": "17.3.0", "version": "17.4.0",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.3.0.tgz", "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.4.0.tgz",
"integrity": "sha512-GQl1pWyDoGptFPJx9b9L6kmR33TGusZvXIZUT+BOz9f7X2L94oeAskFYLEg/FkhV06zZPBYLvLZRWeYId29lew==", "integrity": "sha512-WJudfrk81yWFSOkZYpAZx4Nt7V4xp7S/uJkX0CnxovMCt1wCE8LNftPpNuF9X/u9gN5nsD7ycYtRcDf2pL3UiA==",
"requires": { "requires": {
"cliui": "^7.0.2", "cliui": "^7.0.2",
"escalade": "^3.1.1", "escalade": "^3.1.1",

@ -32,7 +32,7 @@
"dompurify": "^2.3.4", "dompurify": "^2.3.4",
"jsdom": "^19.0.0", "jsdom": "^19.0.0",
"y18n": "^5.0.8", "y18n": "^5.0.8",
"yargs": "^17.3.0" "yargs": "^17.4.0"
}, },
"devDependencies": { "devDependencies": {
"marked-man": "^0.7.0" "marked-man": "^0.7.0"

@ -28,7 +28,7 @@ The *SOURCE* can be a URL, a file, or '-' for standard input.
`-K`, `--insecure` `-K`, `--insecure`
* Allow invalid SSL certificates. * (Node.js version only) Allow invalid SSL certificates.
`-j`, `--json` `-j`, `--json`
@ -58,7 +58,7 @@ The *SOURCE* can be a URL, a file, or '-' for standard input.
`-x`, `--proxy` *URL* `-x`, `--proxy` *URL*
* Use specified proxy (can also use `HTTPS_PROXY` environment variable). * (Node.js version only) Use specified proxy. Node.js and Deno can also use `HTTPS_PROXY` environment variable.
`-q`, `--quiet` `-q`, `--quiet`
@ -114,7 +114,7 @@ As usual, exit code 0 indicates success, and anything other than 0 is an error.
**readability-cli** supports localization, using the environment variables `LC_ALL`, `LC_MESSAGES`, `LANG` and `LANGUAGE`, in that order. Only one language at a time is supported. **readability-cli** supports localization, using the environment variables `LC_ALL`, `LC_MESSAGES`, `LANG` and `LANGUAGE`, in that order. Only one language at a time is supported.
`HTTPS_PROXY` will set the HTTPS proxy, as previously stated, however the `--proxy` option overrides this. Lowercase `https_proxy` and `http_proxy` are also recognized. `HTTPS_PROXY` will set the HTTPS proxy, as previously stated, however the `--proxy` option overrides this. Node.js also recognizes lowercase `https_proxy` and `http_proxy`, for compatibility with `curl`.
## EXAMPLE ## EXAMPLE

@ -0,0 +1,131 @@
#!/usr/bin/env -S deno run --allow-net --allow-read --allow-write --allow-env=HTTPS_PROXY,LC_ALL,LC_MESSAGES,LANG,LANGUAGE --no-prompt --no-check
// Version string printed by printVersion().
const version = "2.3.5"
import * as path from "https://deno.land/std@0.130.0/path/mod.ts"
import { Buffer } from "https://deno.land/std@0.130.0/node/buffer.ts"
import fs from "https://deno.land/std@0.130.0/node/fs.ts"
import process from "https://deno.land/std@0.130.0/node/process.ts"
import yargs from "https://deno.land/x/yargs@v17.4.0-deno/deno.ts"
import y18n from "https://deno.land/x/y18n@v5.0.8-deno/deno.ts"
import { initParser, DOMParser, DOMParserMimeType, Document, Element } from "https://deno.land/x/deno_dom@v0.1.21-alpha/deno-dom-wasm-noinit.ts"
import * as ammonia from "https://deno.land/x/ammonia@0.3.1/mod.ts"
import { Readability, isProbablyReaderable } from "https://esm.sh/@mozilla/readability@0.4.1?no-check"
import UserAgent from "https://esm.sh/user-agents@1.0.963"
// GNU gettext gives preference to LANGUAGE above all else, but this order is consistent with Yargs:
const localeVariableNames = ["LC_ALL", "LC_MESSAGES", "LANG", "LANGUAGE"]

// The first variable holding a non-empty value wins; later ones are not even read.
let rawLocale = "en_US"
for (const variableName of localeVariableNames) {
	const value = Deno.env.get(variableName)
	if (value) {
		rawLocale = value
		break
	}
}

// Drop everything from the first '.' or ':' onwards, e.g. "ru_RU.UTF-8" -> "ru_RU".
const locale = rawLocale.replace(/[.:].*/, '')

// Translation files are looked up in the "locales" directory next to this script.
const scriptDirectory = path.dirname(path.fromFileUrl(import.meta.url))
const translator = y18n({
	locale: locale,
	updateFiles: false,
	directory: path.join(scriptDirectory, "locales")
})
const __ = translator.__
/**
 * Prints two lines to standard output: the readability-cli version,
 * then the version of the Deno runtime executing the script.
 */
function printVersion() {
	const versionLines = [
		`readability-cli v${version}`,
		`Deno ${Deno.version.deno}`
	]
	for (const line of versionLines) {
		console.log(line)
	}
}
/**
 * Downloads a page with `fetch` and parses the response body into a DOM document.
 *
 * @param url - Address to download; also handed to `parseDOM` as the document URL.
 * @param proxy - Never read by this function. The man page lists `--proxy` as
 *   "Node.js version only"; the parameter presumably exists so the signature
 *   matches the positional call made by the shared CLI code.
 * @param strictSSL - Never read by this function (`--insecure` is likewise
 *   documented as Node.js-only).
 * @param userAgent - Value for the `User-Agent` request header. When omitted,
 *   a desktop user-agent string is generated with the `user-agents` package.
 * @returns Whatever `parseDOM` resolves to for the downloaded text.
 * @throws A plain object `{ statusCode, response: { statusMessage } }` when
 *   `response.ok` is false. It is deliberately not an `Error` instance, so the
 *   shape stays exactly as callers already receive it.
 */
async function parseDOMFromURL(url: string, proxy?: string, strictSSL?: boolean, userAgent?: string) {
	// Start parser initialisation before the request; it is awaited only once the body has been read.
	const initParserPromise = initParser()

	const userAgentString = userAgent ?? new UserAgent({ deviceCategory: "desktop" }).toString()

	const response = await fetch(url, {
		headers: {
			"User-Agent": userAgentString
		}
	})

	if (!response.ok) {
		throw {
			statusCode: response.status,
			response: {
				statusMessage: response.statusText
			}
		}
	}

	const text = await response.text()
	await initParserPromise

	// Keep only the media type, dropping parameters such as "; charset=utf-8".
	// A header without ';' must be kept whole: slicing up to indexOf(';') === -1
	// would silently chop off its last character.
	const contentType = response.headers.get("Content-Type") ?? ""
	const separatorIndex = contentType.indexOf(";")
	const mediaType = separatorIndex === -1 ? contentType : contentType.slice(0, separatorIndex)
	// Media types are case-insensitive; fall back to HTML when the header is missing or empty.
	const mimeType = mediaType.trim().toLowerCase() || "text/html"

	return parseDOM(text, url, mimeType as DOMParserMimeType)
}
/**
 * Parses `input` (optionally relative to `base`) as a URL.
 * Returns `undefined` instead of throwing when it cannot be parsed.
 */
function tryParseURL(input: string, base?: string | URL): URL | undefined {
	try {
		return new URL(input, base)
	} catch {
		return undefined
	}
}

/**
 * Parses markup into a document and rewrites relative `href` attributes to absolute URLs.
 *
 * The base used for resolving links is the document's first `<base href>`
 * (itself resolved against `url` when it is relative), falling back to `url`.
 * When neither yields a base, the links are left untouched.
 *
 * @param html - Markup to parse.
 * @param url - URL of the document, used as the fallback base for relative links.
 * @param mimeType - MIME type passed to the parser; defaults to "text/html".
 * @returns A one-element array containing the parsed document.
 * @throws Error when the parser returns no document.
 * @throws TypeError when `url` is needed as the base but is not a valid absolute URL.
 */
async function parseDOM(html: string, url?: string, mimeType?: DOMParserMimeType) {
	await initParser()
	const document = new DOMParser().parseFromString(html, mimeType ?? "text/html")
	if (!document) {
		throw new Error("Failed to parse HTML document")
	}

	const baseHref = document.getElementsByTagName("base")[0]?.getAttribute("href")
	let baseURL: URL | undefined
	if (baseHref) {
		// Try the <base> on its own first so that an absolute <base> keeps working
		// even when `url` is unusable; only then treat it as relative to `url`.
		baseURL = tryParseURL(baseHref) ?? (url ? tryParseURL(baseHref, url) : undefined)
	}
	if (!baseURL && url) {
		baseURL = new URL(url)
	}

	const root = document.documentElement
	if (baseURL && root) {
		// Depth-first walk over every element, using an explicit stack.
		const pending: Element[] = [root]
		for (let element = pending.pop(); element; element = pending.pop()) {
			const href = element.getAttribute("href")
			// Absolute URLs are left exactly as written.
			if (href && !tryParseURL(href)) {
				const resolved = tryParseURL(href, baseURL)
				// An href that cannot be resolved at all is left alone rather than aborting the parse.
				if (resolved) {
					element.setAttribute("href", resolved.href)
				}
			}
			for (const child of element.children) {
				pending.push(child)
			}
		}
	}

	return [document]
}
/**
 * Reads a file, decodes its bytes with a default `TextDecoder`,
 * and parses the resulting text with `parseDOM`.
 *
 * @param file - Path of the file to read.
 * @param url - Passed through to `parseDOM` as the document URL.
 * @returns Whatever `parseDOM` resolves to.
 */
async function parseDOMFromFile(file: string, url: string) {
	const bytes = await Deno.readFile(file)
	const decoder = new TextDecoder()
	const markup = decoder.decode(bytes)
	return parseDOM(markup, url)
}
/**
 * Cleans an HTML string with the ammonia sanitizer.
 *
 * `ammonia.init()` is awaited on every call before `ammonia.clean` runs.
 *
 * @param html - Markup to sanitize.
 * @returns The value returned by `ammonia.clean`.
 */
async function sanitizeHTML(html: string) {
	await ammonia.init()
	const cleaned = ammonia.clean(html)
	return cleaned
}
/**
 * Sanitizes a whole document by serializing its root element
 * and passing the markup to `sanitizeHTML`.
 *
 * @param document - Document whose `documentElement` is serialized.
 * @returns The sanitized HTML string produced by `sanitizeHTML`.
 */
async function sanitizeDOM(document: Document) {
	const { outerHTML } = document.documentElement!
	return sanitizeHTML(outerHTML)
}
import readable from "./common.mjs"
// Run the shared CLI implementation with the Deno-specific pieces defined in this file.
// The arguments are positional: their order has to match the parameter list of the
// function exported by common.mjs.
const argumentParser = yargs(Deno.args)
await readable(
	Buffer, fs, process,
	argumentParser,
	__,
	Readability, isProbablyReaderable,
	printVersion,
	parseDOM, parseDOMFromFile, parseDOMFromURL,
	sanitizeDOM, sanitizeHTML
)

// When the CLI recorded a non-zero exit code, terminate explicitly.
// NOTE(review): presumably needed because this runtime does not apply process.exitCode on its own — confirm.
const recordedExitCode = process.exitCode
if (recordedExitCode) {
	process.exit()
}
Loading…
Cancel
Save