Dual Node/Deno package!

+ fix "bad CLI usage" exit code on yargs fail
main
gardenapple 2 years ago
parent 79677040df
commit 247425c6f5
No known key found for this signature in database
GPG Key ID: CAF17E9ABE789268

@ -0,0 +1,557 @@
/*
Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library
Copyright (C) 2022 gardenapple
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
export default async function(
Buffer,
fs,
process,
yargs,
__,
Readability,
isProbablyReaderable,
printVersion,
parseDOM,
parseDOMFromFile,
parseDOMFromURL,
sanitizeDOM,
sanitizeHTML
) {
//Becomes true as soon as any step reports a failure; it is checked once
//argument handling is finished, before any input is loaded.
let errored = false;

/**
 * Marks the run as failed.
 * @param exitCode status stored in process.exitCode, i.e. the code the
 *                 process will exit with once it finishes
 */
function setErrored(exitCode) {
  errored = true;
  process.exitCode = exitCode;
}
//Exit statuses handed to setErrored(). The numeric values match the
//BSD sysexits.h conventions (EX_USAGE, EX_DATAERR, EX_NOINPUT, EX_NOHOST,
//EX_UNAVAILABLE, EX_NOPERM).
const ExitCodes = {
//invalid command-line usage (bad option, missing input, malformed URL)
badUsageCLI: 64,
//the document could not be processed, or is probably not an article
dataError: 65,
//the input file does not exist (ENOENT)
noInput: 66,
//the host name could not be resolved (ENOTFOUND)
noHost: 68,
//HTTP status errors and any other failure while loading the input
serviceUnavailable: 69,
//the input file is not accessible (EACCES)
noPermission: 77
};
//
//Parsing arguments
//
const Properties = new Map([
["html-title", (article, singleLine, window) =>
`<h1>${escapeHTML(Properties.get("title")(article, singleLine, window), window.document)}</h1>`
],
["title", (article, singleLine) =>
singleLine ? article.title.replace(/\n+/gm, ' ') : article.title
],
["excerpt", (article, singleLine) =>
singleLine ? article.excerpt.replace(/\n+/gm, ' ') : article.excerpt
],
["byline", (article, singleLine) =>
singleLine ? article.byline.replace(/\n+/gm, ' ') : article.byline
],
["length", article => article.length],
["dir", article => article.dir],
["text-content", article => article.textContent],
["html-content", (article, _, window) => article.content]
]);
//Values accepted by --low-confidence; they decide what happens when
//isProbablyReaderable() reports that the document may not be an article.
const LowConfidenceMode = {
//output the document without running Readability (used when the option is omitted)
keep: "keep",
//skip the isProbablyReaderable() check and always run Readability
force: "force",
//print an error and finish with ExitCodes.dataError
exit: "exit"
};
//Options object passed to the Readability constructor; populated from CLI flags.
const readabilityOptions = {};
//backwards compat with old, comma-separated values
function yargsCompatProperties(args) {
if (args["properties"]) {
for (var i = 0; i < args["properties"].length; i++) {
const property = args["properties"][i];
if (property.indexOf(',') > -1) {
const split = args["properties"][i].split(',');
args["properties"].splice(i, 1, ...split);
continue;
}
if (!Properties.has(property)) {
args["properties"].splice(i, 1);
i--;
if (!args["--"])
args["--"] = [ property ];
else
args["--"].push(property);
}
}
}
}
//Positional sometimes don't get recognized when they're put
//after other arguments, I think it's an oversight in yargs.
function yargsFixPositional(args) {
if (args["--"]) {
if (!args["source"])
args["source"] = args["--"].shift();
args["_"] = args["--"];
}
}
//Declare the command-line interface and parse the arguments.
//On a usage error the fail handler below only records the exit code;
//parse() still returns, and the `errored` flag is checked later on.
let args = yargs
//yargs' built-in version option is switched off; a custom --version is declared below
.version(false)
//default command with one optional positional, SOURCE
.command("* [source]", __`Process HTML input`, (yargs) => {
yargs.positional("source", {
desc: __`A file, an http(s) URL, or '-' for standard input`,
type: "string"
});
})
//completion script is requested with --completion; `false` as the description
//presumably keeps the command out of the help text (see yargs docs)
.completion("--completion", false)
//`true` = run these before validation, so the `choices` check for
//--properties sees the already-normalised list
.middleware([ yargsCompatProperties, yargsFixPositional ], true) //middleware seems to be buggy
.option("help", {
alias: 'h',
type: "boolean",
desc: __`Show help`
})
.option("completion", {
type: "boolean",
desc: __`Print script for bash/zsh completion`
})
.option("base", {
alias: 'b',
type: "string",
desc: __`Set the document URL when parsing standard input or a local file (this affects relative links)`
})
.option("insane", {
alias: 'S',
type: "boolean",
desc: __`Don't sanitize HTML`
})
.option("insecure", {
alias: 'K',
type: "boolean",
desc: __`Allow invalid SSL certificates`,
//not shown in help when running under Deno, where the option is rejected
hidden: typeof Deno !== "undefined"
})
.option("is-file", {
alias: 'f',
type: "boolean",
desc: __`Interpret SOURCE as a file name rather than a URL`,
default: false,
hidden: true,
//deprecated: true
})
.option("is-url", {
alias: 'U',
type: "boolean",
desc: __`(deprecated) Interpret SOURCE as a URL rather than file name`,
hidden: true,
//deprecated: true
})
.option("json", {
alias: 'j',
type: "boolean",
desc: __`Output properties as a JSON payload`
})
.option("low-confidence", {
alias: 'l',
type: "string",
desc: __`What to do if Readability.js is uncertain about what the core content actually is`,
choices: ["keep", "force", "exit"]
//default: "no-op", //don't set default because completion won't work
})
.option("keep-classes", {
alias: 'C',
type: "boolean",
desc: __`Preserve all CSS classes for input elements, instead of adapting to Firefox's Reader Mode`
})
.option("output", {
alias: 'o',
type: "string",
desc: __`The file to which the result should be output`
})
.option("properties", {
alias: 'p',
type: "array",
desc: __`Output specific properties of the parsed article`,
//only names that have a getter in Properties are accepted
choices: Array.from(Properties.keys())
})
.option("proxy", {
alias: 'x',
type: "string",
desc: __`Use specified proxy (can also use HTTPS_PROXY environment variable)`,
//not shown in help when running under Deno, where the option is rejected
hidden: typeof Deno !== "undefined"
})
.option("quiet", {
alias: 'q',
type: "boolean",
desc: __`Don't output extra information to stderr`
})
.option("style", {
alias: 's',
type: "string",
desc: __`Specify .css file for stylesheet`
})
.option("url", {
alias: 'u',
type: "string",
desc: __`(deprecated) alias for --base`,
hidden: true,
//deprecated: true //completion script does not respect this value, so just say it in the description
})
.option("user-agent", {
alias: 'A',
type: "string",
desc: __`Set custom user agent string`
})
.option("version", {
alias: 'V',
type: "boolean",
desc: __`Print version`
})
//Custom failure handler: print the message and record the
//"bad CLI usage" exit code through setErrored()
.fail((msg, err, yargs) => {
console.error(msg);
setErrored(ExitCodes.badUsageCLI);
})
.epilogue(__`See the manual for more info: man readability-cli`)
//help text is wrapped at the terminal width, capped at 100 columns
.wrap(Math.min(yargs.terminalWidth(), 100))
.strict()
.parse();
if (args["is-url"]) {
console.error(__`Note: --is-url option is deprecated.`);
}
if (args["url"]) {
console.error(__`Note: --url option is deprecated, please use --base instead.`);
args["base"] = args["url"];
}
function printUsage() {
yargs.showHelp();
}
if (args["completion"]) {
yargs.showCompletionScript();
return;
}
if (args["version"]) {
printVersion();
return;
}
if (typeof Deno !== "undefined") {
for (const option of ["insecure", "proxy"]) {
if (args[option]) {
console.error(__`Warning: option --${option} is not supported in Deno.`);
setErrored(ExitCodes.badUsageCLI);
return;
}
}
}
if (args["keep-classes"]) {
readabilityOptions["keepClasses"] = true;
}
if (!args["low-confidence"]) {
args["low-confidence"] = LowConfidenceMode.keep;
args['l'] = LowConfidenceMode.keep;
} else if (!Object.values(LowConfidenceMode).includes(args["low-confidence"])) {
console.error(__`Unknown mode: ${args["low-confidence"]}\nPlease use one of: keep, force, exit`);
console.error(__`Use --help for more info.`);
setErrored(ExitCodes.badUsageCLI);
return;
}
let inputArg;
if (!args["source"]) {
if (process.stdin.isTTY) {
console.error(__`No input provided`);
printUsage();
setErrored(ExitCodes.badUsageCLI);
return;
} else {
inputArg = '-'
}
} else {
inputArg = args["source"];
}
//Get input parameter, remove inputArg from args
let inputFile;
let inputURL;
let inputIsFromStdin = false;
if (args["is-url"] && inputArg.search(/^\w+:\/\//) == -1)
inputArg = "https://" + inputArg;
if (!args["is-file"] && inputArg.search(/^\w+:\/\//) != -1)
inputURL = inputArg;
else if (inputArg == '-')
inputIsFromStdin = true;
else
inputFile = inputArg;
const outputArg = args["output"];
const documentURL = args["base"] || inputURL;
const outputJSON = args["json"];
let proxy = args["proxy"];
if (!proxy && typeof Deno === "undefined")
proxy = process.env.https_proxy || process.env.HTTPS_PROXY || process.env.http_proxy;
let wantedProperties;
if (args["properties"]) {
wantedProperties = args["properties"];
}
if (errored) {
printUsage();
return;
}
async function read(stream) {
const chunks = [];
for await (const chunk of stream){
chunks.push(chunk);
}
return Buffer.concat(chunks).toString("utf8");
}
let document, window
try {
if (inputIsFromStdin) {
if (!args["quiet"]) {
console.error("Reading...");
if (!documentURL)
console.error(__`Warning: piping input with unknown URL. This means that relative links will be broken. Supply the --base parameter to fix.`)
}
const input = await read(process.stdin);
[document, window] = await parseDOM(result, documentURL);
} else {
if (!args["quiet"])
console.error(__`Retrieving...`);
let parseDOMPromise;
if (inputURL) {
parseDOMPromise = parseDOMFromURL(documentURL, proxy, !args["insecure"], args["user-agent"]);
} else if (inputFile) {
parseDOMPromise = parseDOMFromFile(inputFile, documentURL);
}
[document, window] = await parseDOMPromise;
}
} catch (error) {
if (error.error) {
//Nested error?
error = error.error;
}
if (error instanceof TypeError && inputURL) {
console.error(__`Invalid URL: ${inputURL}`);
setErrored(ExitCodes.badUsageCLI);
} else if (error.code == "ENOENT") {
console.error(error.message);
setErrored(ExitCodes.noInput);
} else if (error.code == "EACCES") {
console.error(error.message);
setErrored(ExitCodes.noPermission);
} else if (error.code == "ENOTFOUND") {
console.error(__`Host not found: ${error.hostname}`);
setErrored(ExitCodes.noHost);
} else if (error.statusCode) {
console.error(__`Status error: ${error.response.statusMessage}`);
setErrored(ExitCodes.serviceUnavailable);
} else {
console.error(error.message);
//console.error(error);
setErrored(ExitCodes.serviceUnavailable);
}
return;
}
//Taken from https://stackoverflow.com/a/22706073/5701177
function escapeHTML(string, document) {
var p = document.createElement("p");
p.appendChild(document.createTextNode(string));
return p.innerHTML;
}
async function getHTML(document, window) {
if (args["insane"])
return document.documentElement.outerHTML;
else
return await sanitizeDOM(document, window);
}
let shouldParseArticle = true;
if (args["low-confidence"] != LowConfidenceMode.force)
shouldParseArticle = isProbablyReaderable(document);
if (!shouldParseArticle) {
if (args["low-confidence"] == LowConfidenceMode.exit) {
console.error(__`Not sure if this document should be processed, exiting`);
setErrored(ExitCodes.dataError);
return;
} else {
if (!args["quiet"])
console.error(__`Not sure if this document should be processed. Not processing`);
if (args["json"] || wantedProperties) {
console.error(__`Can't output properties`);
setErrored(ExitCodes.dataError);
return;
}
shouldParseArticle = false;
}
}
let writeStream;
if (outputArg) {
writeStream = fs.createWriteStream(outputArg);
} else {
writeStream = process.stdout;
}
if (!shouldParseArticle) {
//Ignore wantedProperties, that should've thrown an error before
writeStream.write(await getHTML(document, window));
return;
}
if (!args["quiet"])
console.error(__`Processing...`);
const reader = new Readability(document, readabilityOptions);
const article = reader.parse();
if (!article) {
if (args["low-confidence"] == LowConfidenceMode.keep) {
if (!args["quiet"])
console.error(__`Couldn't process document.`);
writeStream.write(await getHTML(document, window));
} else {
console.error(__`Couldn't process document.`);
setErrored(ExitCodes.dataError);
}
return;
}
//
//Output
//

/**
 * Evaluates one entry of Properties for the parsed article.
 *
 * "html-content" is markup taken from the input document, so it is passed
 * through sanitizeHTML unless --insane was given. This applies to every
 * output format (--json, --properties and the full HTML page alike).
 *
 * @param name       key of Properties
 * @param singleLine passed on to the getter (flattens text onto one line)
 * @returns the property value
 */
async function outputProperty(name, singleLine) {
  const value = Properties.get(name)(article, singleLine, window);
  if (name == "html-content" && !args["insane"])
    return await sanitizeHTML(value, window);
  return value;
}

if (outputJSON) {
  //JSON payload: either the requested properties or all of them.
  const result = {};
  if (wantedProperties) {
    //`const` is required: this file is an ES module (strict mode), where
    //assigning to an undeclared loop variable throws a ReferenceError.
    for (const propertyName of wantedProperties)
      result[propertyName] = await outputProperty(propertyName, false);
  } else {
    for (const name of Properties.keys())
      result[name] = await outputProperty(name, false);
  }
  writeStream.write(JSON.stringify(result));
} else {
  if (wantedProperties) {
    //Plain text: one requested property per line.
    for (const propertyName of wantedProperties)
      writeStream.write((await outputProperty(propertyName, true)) + '\n');
  } else {
    //Full HTML page.
    writeStream.write(`<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">`);
    //A stylesheet is linked when --style is given, or by default (Firefox's
    //Reader Mode stylesheet) unless --keep-classes is in effect.
    if (args["style"] || !args["keep-classes"]) {
      const cssHref = args["style"] || "chrome://global/skin/aboutReader.css";
      writeStream.write(`
<link rel="stylesheet" href="${cssHref}" type="text/css">`);
    }
    writeStream.write(`
<title>${escapeHTML(Properties.get("title")(article, false, window), document)}</title>
</head>
`
    );
    if (!args["keep-classes"]) {
      //Add a few divs and classes so that Firefox Reader Mode CSS works well
      writeStream.write(`
<body class="light sans-serif loaded" style="--font-size:14pt; --content-width:40em;">
<div class="container" `
      );
      const contentDir = Properties.get("dir")(article, false, window);
      if (contentDir)
        writeStream.write(`dir="${contentDir}">`);
      else
        writeStream.write('>');
      writeStream.write(`
<div class="header reader-header reader-show-element">
<h1 class="reader-title">${escapeHTML(Properties.get("title")(article, false, window), document)}</h1>`);
      const author = Properties.get("byline")(article, false, window);
      if (author) {
        writeStream.write(`
<div class="credits reader-credits">${escapeHTML(author, document)}</div>`);
      }
      writeStream.write(`
</div>
<hr>
<div class="content">
<div class="moz-reader-content reader-show-element">
`
      );
      writeStream.write(await outputProperty("html-content", false));
      writeStream.write(`
</div>
</div>
</div>
`
      );
    } else {
      //--keep-classes: minimal page without the Reader Mode wrappers
      writeStream.write("\n<body>\n");
      writeStream.write(Properties.get("html-title")(article, false, window));
      writeStream.write('\n');
      const author = Properties.get("byline")(article, false, window);
      if (author) {
        writeStream.write(`<p><i>${escapeHTML(author, document)}</i></p>`);
      }
      writeStream.write("\n<hr>\n");
      writeStream.write(await outputProperty("html-content", false));
    }
    writeStream.write("\n</body></html>");
  }
}
}

@ -4,7 +4,7 @@
Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library
Copyright (C) 2021 gardenapple
Copyright (C) 2022 gardenapple
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@ -20,7 +20,12 @@ Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
const fs = require("fs");
const path = require("path");
const process = require("process");
const yargs = require("yargs");
const y18n = require("y18n");
// GNU gettext gives preference to LANGUAGE above all else, but this order is consistent with Yargs:
const locale = (
@ -31,541 +36,79 @@ const locale = (
"en_US"
).replace(/[.:].*/, '');
const yargs = require("yargs");
const __ = require("y18n")({
const __ = y18n({
locale: locale,
updateFiles: false,
directory: path.resolve(__dirname, "locales")
directory: path.resolve(__dirname, "./locales")
}).__;
//JSDOM, fs, Readability, and Readability-readerable are loaded on-demand.
const ExitCodes = {
badUsageCLI: 64,
dataError: 65,
noInput: 66,
noHost: 68,
serviceUnavailable: 69,
noPermission: 77
};
let errored = false;
function setErrored(exitCode) {
process.exitCode = exitCode;
errored = true;
}
//
//Parsing arguments
//
const Properties = new Map([
["html-title", (article, singleLine, window) =>
`<h1>${escapeHTML(Properties.get("title")(article, singleLine, window), window.document)}</h1>`
],
["title", (article, singleLine) =>
singleLine ? article.title.replace(/\n+/gm, ' ') : article.title
],
["excerpt", (article, singleLine) =>
singleLine ? article.excerpt.replace(/\n+/gm, ' ') : article.excerpt
],
["byline", (article, singleLine) =>
singleLine ? article.byline.replace(/\n+/gm, ' ') : article.byline
],
["length", article => article.length],
["dir", article => article.dir],
["text-content", article => article.textContent],
["html-content", (article, _, window) => {
if (!args["insane"]) {
const createDOMPurify = require("dompurify");
const DOMPurify = createDOMPurify(window);
return DOMPurify.sanitize(article.content);
}
return article.content;
}]
]);
const LowConfidenceMode = {
keep: "keep",
force: "force",
exit: "exit"
};
const readabilityOptions = {};
//backwards compat with old, comma-separated values
function yargsCompatProperties(args) {
if (args["properties"]) {
for (var i = 0; i < args["properties"].length; i++) {
const property = args["properties"][i];
if (property.indexOf(',') > -1) {
const split = args["properties"][i].split(',');
args["properties"].splice(i, 1, ...split);
continue;
}
if (!Properties.has(property)) {
args["properties"].splice(i, 1);
i--;
if (!args["--"])
args["--"] = [ property ];
else
args["--"].push(property);
}
}
}
}
//Positional sometimes don't get recognized when they're put
//after other arguments, I think it's an oversight in yargs.
function yargsFixPositional(args) {
if (args["--"]) {
if (!args["source"])
args["source"] = args["--"].shift();
args["_"] = args["--"];
}
}
let args = yargs
.version(false)
.command("* [source]", __`Process HTML input`, (yargs) => {
yargs.positional("source", {
desc: __`A file, an http(s) URL, or '-' for standard input`,
type: "string"
});
})
.completion('--completion', false)
.middleware([ yargsCompatProperties, yargsFixPositional ], true) //middleware seems to be buggy
.option("help", {
alias: 'h',
type: "boolean",
desc: __`Show help`
})
.option("completion", {
type: "boolean",
desc: __`Print script for bash/zsh completion`
})
.option("base", {
alias: 'b',
type: "string",
desc: __`Set the document URL when parsing standard input or a local file (this affects relative links)`
})
.option("insane", {
alias: 'S',
type: "boolean",
desc: __`Don't sanitize HTML`
})
.option("insecure", {
alias: 'K',
type: "boolean",
desc: __`Allow invalid SSL certificates`
})
.option("is-file", {
alias: 'f',
type: "boolean",
desc: __`Interpret SOURCE as a file name rather than a URL`,
default: false,
hidden: true,
//deprecated: true
})
.option("is-url", {
alias: 'U',
type: "boolean",
desc: __`(deprecated) Interpret SOURCE as a URL rather than file name`,
hidden: true,
//deprecated: true
})
.option("json", {
alias: 'j',
type: "boolean",
desc: __`Output properties as a JSON payload`
})
.option("low-confidence", {
alias: 'l',
type: "string",
desc: __`What to do if Readability.js is uncertain about what the core content actually is`,
choices: ["keep", "force", "exit"]
//default: "no-op", //don't set default because completion won't work
})
.option("keep-classes", {
alias: 'C',
type: "boolean",
desc: __`Preserve all CSS classes for input elements, instead of adapting to Firefox's Reader Mode`
})
.option("output", {
alias: 'o',
type: "string",
desc: __`The file to which the result should be output`
})
.option("properties", {
alias: 'p',
type: "array",
desc: __`Output specific properties of the parsed article`,
choices: Array.from(Properties.keys())
})
.option("proxy", {
alias: 'x',
type: "string",
desc: __`Use specified proxy (can also use HTTPS_PROXY environment variable)`
})
.option("quiet", {
alias: 'q',
type: "boolean",
desc: __`Don't output extra information to stderr`
})
.option("style", {
alias: 's',
type: "string",
desc: __`Specify .css file for stylesheet`
})
.option("url", {
alias: 'u',
type: "string",
desc: __`(deprecated) alias for --base`,
hidden: true,
//deprecated: true //completion script does not respect this value, so just say it in the description
})
.option("user-agent", {
alias: 'A',
type: "string",
desc: __`Set custom user agent string`
})
.option("version", {
alias: 'V',
type: "boolean",
desc: __`Print version`
})
.epilogue(__`See the manual for more info: man readability-cli`)
.wrap(Math.min(yargs.terminalWidth(), 100))
.strict()
.parse();
if (args["is-url"]) {
console.error(__`Note: --is-url option is deprecated.`);
}
if (args["url"]) {
console.error(__`Note: --url option is deprecated, please use --base instead.`);
args["base"] = args["url"];
}
function printUsage() {
yargs.showHelp();
}
if (args["completion"]) {
yargs.showCompletionScript();
return;
}
const { Readability, isProbablyReaderable } = require("@mozilla/readability");
if (args["version"]) {
function printVersion() {
console.log(`readability-cli v${require("./package.json").version}`);
console.log(`Node.js ${process.version}`);
return;
}
if (args["keep-classes"]) {
readabilityOptions["keepClasses"] = true;
}
if (!args["low-confidence"]) {
args["low-confidence"] = LowConfidenceMode.keep;
args['l'] = LowConfidenceMode.keep;
} else if (!Object.values(LowConfidenceMode).includes(args["low-confidence"])) {
console.error(__`Unknown mode: ${args["low-confidence"]}\nPlease use one of: keep, force, exit`);
console.error(__`Use --help for more info.`);
setErrored(ExitCodes.badUsageCLI);
return;
}
async function parseDOMFromURL(url, proxy, strictSSL, userAgent) {
const { JSDOM, ResourceLoader } = require("jsdom");
const resourceLoader = new ResourceLoader({
proxy: proxy,
strictSSL: strictSSL,
userAgent: userAgent
});
let inputArg;
if (!args["source"]) {
if (process.stdin.isTTY) {
console.error(__`No input provided`);
printUsage();
setErrored(ExitCodes.badUsageCLI);
return;
} else {
inputArg = '-'
}
} else {
inputArg = args["source"];
const dom = await JSDOM.fromURL(url, {
resources: resourceLoader
})
return [dom.window.document, dom.window];
}
//Get input parameter, remove inputArg from args
let inputFile;
let inputURL;
let inputIsFromStdin = false;
if (args["is-url"] && inputArg.search(/^\w+:\/\//) == -1)
inputArg = "https://" + inputArg;
if (!args["is-file"] && inputArg.search(/^\w+:\/\//) != -1)
inputURL = inputArg;
else if (inputArg == '-')
inputIsFromStdin = true;
else
inputFile = inputArg;
const outputArg = args["output"];
const documentURL = args["base"] || inputURL;
const outputJSON = args["json"];
const proxy = args["proxy"] || process.env.https_proxy || process.env.HTTPS_PROXY || process.env.http_proxy;
let wantedProperties;
if (args["properties"]) {
wantedProperties = args["properties"];
async function parseDOM(html, url) {
const { JSDOM } = require("jsdom");
const dom = new JSDOM(html, { url: url });
return [dom.window.document, dom.window];
}
if (errored) {
printUsage();
return;
async function parseDOMFromFile(file, url) {
const { JSDOM } = require("jsdom");
const dom = await JSDOM.fromFile(file, {
url: url,
// workaround for https://gitlab.com/gardenappl/readability-cli/-/issues/9
contentType: "text/html; charset=utf-8"
})
return [dom.window.document, dom.window];
}
async function read(stream) {
const chunks = [];
for await (const chunk of stream){
chunks.push(chunk);
}
return Buffer.concat(chunks).toString("utf8");
async function sanitizeHTML(html, window) {
const createDOMPurify = require("dompurify");
const DOMPurify = createDOMPurify(window);
return DOMPurify.sanitize(html);
}
if (inputIsFromStdin) {
if (!args["quiet"]) {
console.error("Reading...");
if (!documentURL)
console.error(__`Warning: piping input with unknown URL. This means that relative links will be broken. Supply the --base parameter to fix.`)
}
read(process.stdin).then(result => {
const JSDOM = require("jsdom").JSDOM;
onLoadDOM(new JSDOM(result, { url: documentURL }));
});
} else {
if (!args["quiet"])
console.error(__`Retrieving...`);
const jsdom = require("jsdom");
let promiseGetHTML;
if (inputURL) {
const resourceLoader = new jsdom.ResourceLoader({
proxy: proxy,
strictSSL: !args["insecure"],
userAgent: args["user-agent"]
});
promiseGetHTML = jsdom.JSDOM.fromURL(inputURL, {
resources: resourceLoader
});
} else if (inputFile) {
promiseGetHTML = jsdom.JSDOM.fromFile(inputFile, {
url: documentURL,
// workaround for https://gitlab.com/gardenappl/readability-cli/-/issues/9
contentType: "text/html; charset=utf-8"
});
}
promiseGetHTML.then(onLoadDOM, onLoadDOMError);
async function sanitizeDOM(document, window) {
const createDOMPurify = require("dompurify");
const DOMPurify = createDOMPurify(window);
DOMPurify.sanitize(document, {IN_PLACE: true, WHOLE_DOCUMENT: true});
return document.documentElement.outerHTML;
}
const { Readability, isProbablyReaderable } = require("@mozilla/readability");
//Taken from https://stackoverflow.com/a/22706073/5701177
function escapeHTML(string, document) {
var p = document.createElement("p");
p.appendChild(document.createTextNode(string));
return p.innerHTML;
}
function onLoadDOM(dom) {
const window = dom.window
let shouldParseArticle = true;
if (args["low-confidence"] != LowConfidenceMode.force)
shouldParseArticle = isProbablyReaderable(window.document);
if (!shouldParseArticle) {
if (args["low-confidence"] == LowConfidenceMode.exit) {
console.error(__`Not sure if this document should be processed, exiting`);
setErrored(ExitCodes.dataError);
return;
} else {
if (!args["quiet"])
console.error(__`Not sure if this document should be processed. Not processing`);
if (args["json"] || wantedProperties) {
console.error(__`Can't output properties`);
setErrored(ExitCodes.dataError);
return;
}
shouldParseArticle = false;
}
}
let writeStream;
if (outputArg) {
const fs = require("fs");
writeStream = fs.createWriteStream(outputArg);
} else {
writeStream = process.stdout;
}
if (!shouldParseArticle) {
//Ignore wantedProperties, that should've thrown an error before
writeStream.write(getHTML(window));
return;
}
if (!args["quiet"])
console.error(__`Processing...`);
const reader = new Readability(window.document, readabilityOptions);
const article = reader.parse();
if (!article) {
if (args["low-confidence"] == LowConfidenceMode.keep) {
if (!args["quiet"])
console.error(__`Couldn't process document.`);
writeStream.write(getHTML(window));
} else {
console.error(__`Couldn't process document.`);
setErrored(ExitCodes.dataError);
}
return;
}
if (outputJSON) {
let result = {};
if (wantedProperties) {
for (propertyName of wantedProperties)
result[propertyName] = Properties.get(propertyName)(article, false, window);
} else {
for (const [name, func] of Properties) {
result[name] = func(article, false, window);
}
}
writeStream.write(JSON.stringify(result));
} else {
if (wantedProperties) {
for (propertyName of wantedProperties)
writeStream.write(Properties.get(propertyName)(article, true, window) + '\n');
} else {
writeStream.write(`<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">`);
if (args["style"] || !args["keep-classes"]) {
const cssHref = args["style"] || "chrome://global/skin/aboutReader.css";
writeStream.write(`
<link rel="stylesheet" href="${cssHref}" type="text/css">`);
}
writeStream.write(`
<title>${escapeHTML(Properties.get("title")(article, false, window), window.document)}</title>
</head>
`
);
if (!args["keep-classes"]) {
//Add a few divs and classes so that Firefox Reader Mode CSS works well
writeStream.write(`
<body class="light sans-serif loaded" style="--font-size:14pt; --content-width:40em;">
<div class="container" `
);
const contentDir = Properties.get("dir")(article, false, window);
if (contentDir)
writeStream.write(`dir="${contentDir}">`);
else
writeStream.write('>');
writeStream.write(`
<div class="header reader-header reader-show-element">
<h1 class="reader-title">${escapeHTML(Properties.get("title")(article, false, window), window.document)}</h1>`);
const author = Properties.get("byline")(article, false, window);
if (author) {
writeStream.write(`
<div class="credits reader-credits">${escapeHTML(author, window.document)}</div>`);
}
writeStream.write(`
</div>
<hr>
<div class="content">
<div class="moz-reader-content reader-show-element">
`
);
writeStream.write(Properties.get("html-content")(article, false, window));
writeStream.write(`
</div>
</div>
</div>
`
);
} else {
writeStream.write("\n<body>\n");
writeStream.write(Properties.get("html-title")(article, false, window));
writeStream.write('\n');
const author = Properties.get("byline")(article, false, window);
if (author) {
writeStream.write(`<p><i>${escapeHTML(author, window.document)}</i></p>`);
}
writeStream.write("\n<hr>\n");
writeStream.write(Properties.get("html-content")(article, false, window));
}
writeStream.write("\n</body></html>");
}
}
}
function onLoadDOMError(error) {
if (error.error) {
//Nested error?
error = error.error;
}
if (error instanceof TypeError && inputURL) {
console.error(__`Invalid URL: ${inputURL}`);
setErrored(ExitCodes.badUsageCLI);
} else if (error.code == "ENOENT") {
console.error(error.message);
setErrored(ExitCodes.noInput);
} else if (error.code == "EACCES") {
console.error(error.message);
setErrored(ExitCodes.noPermission);
} else if (error.code == "ENOTFOUND") {
console.error(__`Host not found: ${error.hostname}`);
setErrored(ExitCodes.noHost);
} else if (error.statusCode) {
console.error(__`Status error: ${error.response.statusMessage}`);
setErrored(ExitCodes.serviceUnavailable);
} else {
console.error(error.message);
//console.error(error);
setErrored(ExitCodes.serviceUnavailable);
}
}
function getHTML(window) {
let html = window.document.documentElement.outerHTML;
if (!args["insane"]) {
const createDOMPurify = require("dompurify");
const DOMPurify = createDOMPurify(window);
return DOMPurify.sanitize(html, {IN_PLACE: true, WHOLE_DOCUMENT: true});
}
return html;
}
//Load the shared implementation (an ES module, hence the dynamic import
//from this CommonJS file) and hand it the Node.js-specific dependencies
//defined above, in the order its default export expects them.
import("./common.mjs").then(({ default: readable }) => {
  readable(
    Buffer, fs, process, yargs, __,
    Readability, isProbablyReaderable,
    printVersion,
    parseDOM, parseDOMFromFile, parseDOMFromURL,
    sanitizeDOM, sanitizeHTML
  );
});

@ -34,5 +34,6 @@
"Host not found: '%s'": "Host not found: '%s'",
"Unknown mode: %s\nPlease use one of: keep, force, exit": "Unknown mode: %s\nPlease use one of: keep, force, exit",
"Use --help for more info.": "Use --help for more info.",
"See the manual for more info: man readability-cli": "See the manual for more info: man readability-cli"
"See the manual for more info: man readability-cli": "See the manual for more info: man readability-cli",
"Error: option --%s is not supported in Deno.": "Error: option --%s is not supported in Deno."
}

@ -34,5 +34,6 @@
"Host not found: '%s'": "Сервер не найден: '%s'",
"Unknown mode: %s\nPlease use one of: keep, force, exit": "Неизвестный режим: %s\nПожалуйста, используйте один из: keep, force, exit",
"Use --help for more info.": "Чтобы узнать больше, воспользуйтесь --help",
"See the manual for more info: man readability-cli": "Чтобы узнать больше, загляните в мануал: man readability-cli"
"See the manual for more info: man readability-cli": "Чтобы узнать больше, загляните в мануал: man readability-cli",
"Error: option --%s is not supported in Deno.": "Ошибка: параметр --%s не поддерживается в Deno."
}

14
package-lock.json generated

@ -13,7 +13,7 @@
"dompurify": "^2.3.4",
"jsdom": "^19.0.0",
"y18n": "^5.0.8",
"yargs": "^17.3.0"
"yargs": "^17.4.0"
},
"bin": {
"readable": "index.js"
@ -776,9 +776,9 @@
}
},
"node_modules/yargs": {
"version": "17.3.0",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.3.0.tgz",
"integrity": "sha512-GQl1pWyDoGptFPJx9b9L6kmR33TGusZvXIZUT+BOz9f7X2L94oeAskFYLEg/FkhV06zZPBYLvLZRWeYId29lew==",
"version": "17.4.0",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.4.0.tgz",
"integrity": "sha512-WJudfrk81yWFSOkZYpAZx4Nt7V4xp7S/uJkX0CnxovMCt1wCE8LNftPpNuF9X/u9gN5nsD7ycYtRcDf2pL3UiA==",
"dependencies": {
"cliui": "^7.0.2",
"escalade": "^3.1.1",
@ -1344,9 +1344,9 @@
"integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA=="
},
"yargs": {
"version": "17.3.0",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.3.0.tgz",
"integrity": "sha512-GQl1pWyDoGptFPJx9b9L6kmR33TGusZvXIZUT+BOz9f7X2L94oeAskFYLEg/FkhV06zZPBYLvLZRWeYId29lew==",
"version": "17.4.0",
"resolved": "https://registry.npmjs.org/yargs/-/yargs-17.4.0.tgz",
"integrity": "sha512-WJudfrk81yWFSOkZYpAZx4Nt7V4xp7S/uJkX0CnxovMCt1wCE8LNftPpNuF9X/u9gN5nsD7ycYtRcDf2pL3UiA==",
"requires": {
"cliui": "^7.0.2",
"escalade": "^3.1.1",

@ -32,7 +32,7 @@
"dompurify": "^2.3.4",
"jsdom": "^19.0.0",
"y18n": "^5.0.8",
"yargs": "^17.3.0"
"yargs": "^17.4.0"
},
"devDependencies": {
"marked-man": "^0.7.0"

@ -28,7 +28,7 @@ The *SOURCE* can be a URL, a file, or '-' for standard input.
`-K`, `--insecure`
* Allow invalid SSL certificates.
* (Node.js version only) Allow invalid SSL certificates.
`-j`, `--json`
@ -58,7 +58,7 @@ The *SOURCE* can be a URL, a file, or '-' for standard input.
`-x`, `--proxy` *URL*
* Use specified proxy (can also use `HTTPS_PROXY` environment variable).
* (Node.js version only) Use the specified proxy. Node.js and Deno can also use the `HTTPS_PROXY` environment variable.
`-q`, `--quiet`
@ -114,7 +114,7 @@ As usual, exit code 0 indicates success, and anything other than 0 is an error.
**readability-cli** supports localization, using the environment variables `LC_ALL`, `LC_MESSAGES`, `LANG` and `LANGUAGE`, in that order. Only one language at a time is supported.
`HTTPS_PROXY` will set the HTTPS proxy, as previously stated, however the `--proxy` option overrides this. Lowercase `https_proxy` and `http_proxy` are also recognized.
`HTTPS_PROXY` will set the HTTPS proxy, as previously stated; however, the `--proxy` option overrides this. Node.js also recognizes lowercase `https_proxy` and `http_proxy`, for compatibility with `curl`.
## EXAMPLE

@ -0,0 +1,131 @@
#!/usr/bin/env -S deno run --allow-net --allow-read --allow-write --allow-env=HTTPS_PROXY,LC_ALL,LC_MESSAGES,LANG,LANGUAGE --no-prompt --no-check
// Version of readability-cli; printed by printVersion() below.
const version = "2.3.5"
import * as path from "https://deno.land/std@0.130.0/path/mod.ts"
import { Buffer } from "https://deno.land/std@0.130.0/node/buffer.ts"
import fs from "https://deno.land/std@0.130.0/node/fs.ts"
import process from "https://deno.land/std@0.130.0/node/process.ts"
import yargs from "https://deno.land/x/yargs@v17.4.0-deno/deno.ts"
import y18n from "https://deno.land/x/y18n@v5.0.8-deno/deno.ts"
import { initParser, DOMParser, DOMParserMimeType, Document, Element } from "https://deno.land/x/deno_dom@v0.1.21-alpha/deno-dom-wasm-noinit.ts"
import * as ammonia from "https://deno.land/x/ammonia@0.3.1/mod.ts"
import { Readability, isProbablyReaderable } from "https://esm.sh/@mozilla/readability@0.4.1?no-check"
import UserAgent from "https://esm.sh/user-agents@1.0.963"
// GNU gettext gives preference to LANGUAGE above all else, but this order is consistent with Yargs:
const localeVariables = ["LC_ALL", "LC_MESSAGES", "LANG", "LANGUAGE"]
// Fallback used when none of the variables above holds a non-empty value.
let rawLocale = "en_US"
for (const variableName of localeVariables) {
	const value = Deno.env.get(variableName)
	if (value) {
		rawLocale = value
		break
	}
}
// Drop everything from the first '.' or ':' onwards (e.g. "ru_RU.UTF-8" becomes "ru_RU").
const locale = rawLocale.replace(/[.:].*/, '')

// Translation files live in the "locales" directory next to this script.
const localesDirectory = path.join(path.dirname(path.fromFileUrl(import.meta.url)), "locales")
const __ = y18n({
	locale,
	updateFiles: false,
	directory: localesDirectory
}).__
/**
 * Writes the readability-cli version and the Deno runtime version
 * to standard output, one per line.
 */
function printVersion() {
	const lines = [
		`readability-cli v${version}`,
		`Deno ${Deno.version.deno}`
	]
	for (const line of lines) {
		console.log(line)
	}
}
/**
 * Downloads a document over the network and parses it into a DOM.
 *
 * @param url Address to fetch; also handed to parseDOM as the document URL.
 * @param proxy Not used by this implementation; kept so the signature stays positional-compatible.
 * @param strictSSL Not used by this implementation; kept so the signature stays positional-compatible.
 * @param userAgent Value for the User-Agent header. When null/undefined, a desktop
 *   user agent string is generated with the `user-agents` package.
 * @returns Whatever parseDOM returns for the downloaded text.
 * @throws A plain object `{ statusCode, response: { statusMessage } }` when the
 *   response status is not OK.
 */
async function parseDOMFromURL(url: string, proxy?: string, strictSSL?: boolean, userAgent?: string) {
	// Start parser initialisation now so it overlaps with the network request.
	const initParserPromise = initParser()

	const userAgentString = userAgent ?? new UserAgent({ deviceCategory: "desktop" }).toString()

	const response = await fetch(url, {
		headers: {
			"User-Agent": userAgentString
		}
	})
	if (!response.ok) {
		throw {
			statusCode: response.status,
			response: {
				statusMessage: response.statusText
			}
		}
	}

	const text = await response.text()
	await initParserPromise

	// Keep only the media type: drop parameters such as "; charset=utf-8".
	// Splitting (rather than slicing up to indexOf(';')) also works when the
	// header has no parameters at all, and a missing header no longer crashes:
	// parseDOM then falls back to its default MIME type.
	const contentType = response.headers.get("Content-Type")
	const mimeType = contentType?.split(';', 1)[0]?.trim().toLowerCase() || undefined

	return parseDOM(text, url, mimeType as DOMParserMimeType | undefined)
}
/**
 * Parses markup into a DOM and rewrites relative `href` attributes to absolute URLs.
 *
 * The base for resolving links is the `href` of the first `<base>` element
 * (itself resolved against `url`, so a relative `<base href>` works), or `url`
 * when there is no usable `<base>`. When neither is available, links are left untouched.
 *
 * @param html Markup to parse.
 * @param url URL of the document, if known.
 * @param mimeType MIME type handed to DOMParser; defaults to "text/html".
 * @returns A single-element array containing the parsed document.
 * @throws TypeError when the base URL cannot be resolved to an absolute URL.
 */
async function parseDOM(html: string, url?: string, mimeType?: DOMParserMimeType) {
	await initParser()
	const document = new DOMParser().parseFromString(html, mimeType ?? "text/html")!

	const baseHref = document.getElementsByTagName("base")[0]?.getAttribute("href")
	let baseURL: URL | undefined
	if (baseHref) {
		// Resolve against the document URL so that e.g. <base href="/docs/"> is accepted.
		baseURL = new URL(baseHref, url)
	} else if (url) {
		baseURL = new URL(url)
	}

	if (baseURL) {
		// Depth-first walk over every element, using an explicit stack.
		const nodes: Element[] = []
		if (document.documentElement) {
			nodes.push(document.documentElement)
		}
		while (nodes.length > 0) {
			const element = nodes.pop()!
			const href = element.getAttribute("href")
			if (href && !isAbsoluteURL(href)) {
				try {
					// Assume href is a relative URL
					element.setAttribute("href", new URL(href, baseURL).href)
				} catch {
					// Malformed link: leave it as written instead of aborting the whole parse.
				}
			}
			for (const child of element.children) {
				nodes.push(child)
			}
		}
	}

	return [document]
}

/** Returns true when `candidate` parses as a URL without needing a base. */
function isAbsoluteURL(candidate: string): boolean {
	try {
		new URL(candidate)
		return true
	} catch {
		return false
	}
}
/**
 * Reads a file from disk, decodes its bytes with a default TextDecoder
 * and parses the resulting text with parseDOM.
 *
 * @param file Path of the file to read.
 * @param url Document URL forwarded to parseDOM.
 * @returns Whatever parseDOM returns for the file's contents.
 */
async function parseDOMFromFile(file: string, url: string) {
	const bytes = await Deno.readFile(file)
	const decoder = new TextDecoder()
	const html = decoder.decode(bytes)
	return parseDOM(html, url)
}
/**
 * Sanitizes an HTML string with the ammonia library.
 *
 * @param html Markup to clean.
 * @returns The result of `ammonia.clean(html)`.
 */
async function sanitizeHTML(html: string) {
	// ammonia is initialised before every use, as in the original code.
	await ammonia.init()
	const cleaned = ammonia.clean(html)
	return cleaned
}
/**
 * Serializes the document's root element and sanitizes the resulting markup.
 *
 * @param document Document whose root element is serialized via `outerHTML`.
 * @returns The sanitized markup produced by sanitizeHTML.
 */
async function sanitizeDOM(document: Document) {
	const root = document.documentElement!
	const markup = root.outerHTML
	return sanitizeHTML(markup)
}
import readable from "./common.mjs"
// Run the CLI logic exported by ./common.mjs, injecting the Deno-specific
// modules and helper functions defined in this file.
// The arguments are positional: their order must match the parameter list
// of the function exported by common.mjs.
await readable(
Buffer,
fs,
process,
yargs(Deno.args),
__,
Readability,
isProbablyReaderable,
printVersion,
parseDOM,
parseDOMFromFile,
parseDOMFromURL,
sanitizeDOM,
sanitizeHTML
)
// Exit explicitly when a truthy (non-zero) exit code has been recorded on `process`.
// NOTE(review): presumably needed so the code set on the Node-compat `process`
// object actually becomes the exit status under Deno — confirm.
if (process.exitCode) {
process.exit()
}
Loading…
Cancel
Save