From 247425c6f500633a20d8f98255115ab7e0885931 Mon Sep 17 00:00:00 2001 From: gardenapple Date: Tue, 22 Mar 2022 16:43:58 +0200 Subject: [PATCH] Dual Node/Deno package! + fix "bad CLI usage" exit code on yargs fail --- common.mjs | 557 +++++++++++++++++++++++++++++++++++++++++ index.js | 577 +++++-------------------------------------- locales/en.json | 3 +- locales/ru.json | 3 +- package-lock.json | 14 +- package.json | 2 +- readability-cli.1.md | 6 +- readable.ts | 131 ++++++++++ 8 files changed, 763 insertions(+), 530 deletions(-) create mode 100644 common.mjs create mode 100644 readable.ts diff --git a/common.mjs b/common.mjs new file mode 100644 index 0000000..fab9508 --- /dev/null +++ b/common.mjs @@ -0,0 +1,557 @@ +/* + +Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library + + Copyright (C) 2022 gardenapple + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
+*/ + +export default async function( + Buffer, + fs, + process, + yargs, + __, + Readability, + isProbablyReaderable, + printVersion, + parseDOM, + parseDOMFromFile, + parseDOMFromURL, + sanitizeDOM, + sanitizeHTML +) { + let errored = false; + + function setErrored(exitCode) { + process.exitCode = exitCode; + errored = true; + } + + const ExitCodes = { + badUsageCLI: 64, + dataError: 65, + noInput: 66, + noHost: 68, + serviceUnavailable: 69, + noPermission: 77 + }; + + // + //Parsing arguments + // + + const Properties = new Map([ + ["html-title", (article, singleLine, window) => + `

${escapeHTML(Properties.get("title")(article, singleLine, window), window.document)}

` + ], + ["title", (article, singleLine) => + singleLine ? article.title.replace(/\n+/gm, ' ') : article.title + ], + ["excerpt", (article, singleLine) => + singleLine ? article.excerpt.replace(/\n+/gm, ' ') : article.excerpt + ], + ["byline", (article, singleLine) => + singleLine ? article.byline.replace(/\n+/gm, ' ') : article.byline + ], + ["length", article => article.length], + ["dir", article => article.dir], + ["text-content", article => article.textContent], + ["html-content", (article, _, window) => article.content] + ]); + + const LowConfidenceMode = { + keep: "keep", + force: "force", + exit: "exit" + }; + + const readabilityOptions = {}; + + //backwards compat with old, comma-separated values + function yargsCompatProperties(args) { + if (args["properties"]) { + for (var i = 0; i < args["properties"].length; i++) { + const property = args["properties"][i]; + if (property.indexOf(',') > -1) { + const split = args["properties"][i].split(','); + args["properties"].splice(i, 1, ...split); + continue; + } + if (!Properties.has(property)) { + args["properties"].splice(i, 1); + i--; + if (!args["--"]) + args["--"] = [ property ]; + else + args["--"].push(property); + } + } + } + } + + //Positional sometimes don't get recognized when they're put + //after other arguments, I think it's an oversight in yargs. 
+ function yargsFixPositional(args) { + if (args["--"]) { + if (!args["source"]) + args["source"] = args["--"].shift(); + args["_"] = args["--"]; + } + } + + + let args = yargs + .version(false) + .command("* [source]", __`Process HTML input`, (yargs) => { + yargs.positional("source", { + desc: __`A file, an http(s) URL, or '-' for standard input`, + type: "string" + }); + }) + .completion("--completion", false) + .middleware([ yargsCompatProperties, yargsFixPositional ], true) //middleware seems to be buggy + .option("help", { + alias: 'h', + type: "boolean", + desc: __`Show help` + }) + .option("completion", { + type: "boolean", + desc: __`Print script for bash/zsh completion` + }) + .option("base", { + alias: 'b', + type: "string", + desc: __`Set the document URL when parsing standard input or a local file (this affects relative links)` + }) + .option("insane", { + alias: 'S', + type: "boolean", + desc: __`Don't sanitize HTML` + }) + .option("insecure", { + alias: 'K', + type: "boolean", + desc: __`Allow invalid SSL certificates`, + hidden: typeof Deno !== "undefined" + }) + .option("is-file", { + alias: 'f', + type: "boolean", + desc: __`Interpret SOURCE as a file name rather than a URL`, + default: false, + hidden: true, + //deprecated: true + }) + .option("is-url", { + alias: 'U', + type: "boolean", + desc: __`(deprecated) Interpret SOURCE as a URL rather than file name`, + hidden: true, + //deprecated: true + }) + .option("json", { + alias: 'j', + type: "boolean", + desc: __`Output properties as a JSON payload` + }) + .option("low-confidence", { + alias: 'l', + type: "string", + desc: __`What to do if Readability.js is uncertain about what the core content actually is`, + choices: ["keep", "force", "exit"] + //default: "no-op", //don't set default because completion won't work + }) + .option("keep-classes", { + alias: 'C', + type: "boolean", + desc: __`Preserve all CSS classes for input elements, instead of adapting to Firefox's Reader Mode` + }) + 
.option("output", { + alias: 'o', + type: "string", + desc: __`The file to which the result should be output` + }) + .option("properties", { + alias: 'p', + type: "array", + desc: __`Output specific properties of the parsed article`, + choices: Array.from(Properties.keys()) + }) + .option("proxy", { + alias: 'x', + type: "string", + desc: __`Use specified proxy (can also use HTTPS_PROXY environment variable)`, + hidden: typeof Deno !== "undefined" + }) + .option("quiet", { + alias: 'q', + type: "boolean", + desc: __`Don't output extra information to stderr` + }) + .option("style", { + alias: 's', + type: "string", + desc: __`Specify .css file for stylesheet` + }) + .option("url", { + alias: 'u', + type: "string", + desc: __`(deprecated) alias for --base`, + hidden: true, + //deprecated: true //completion script does not respect this value, so just say it in the description + }) + .option("user-agent", { + alias: 'A', + type: "string", + desc: __`Set custom user agent string` + }) + .option("version", { + alias: 'V', + type: "boolean", + desc: __`Print version` + }) + .fail((msg, err, yargs) => { + console.error(msg); + setErrored(ExitCodes.badUsageCLI); + }) + .epilogue(__`See the manual for more info: man readability-cli`) + .wrap(Math.min(yargs.terminalWidth(), 100)) + .strict() + .parse(); + + if (args["is-url"]) { + console.error(__`Note: --is-url option is deprecated.`); + } + if (args["url"]) { + console.error(__`Note: --url option is deprecated, please use --base instead.`); + args["base"] = args["url"]; + } + + + function printUsage() { + yargs.showHelp(); + } + + if (args["completion"]) { + yargs.showCompletionScript(); + return; + } + + if (args["version"]) { + printVersion(); + return; + } + + + if (typeof Deno !== "undefined") { + for (const option of ["insecure", "proxy"]) { + if (args[option]) { + console.error(__`Warning: option --${option} is not supported in Deno.`); + setErrored(ExitCodes.badUsageCLI); + return; + } + } + } + + + if 
(args["keep-classes"]) { + readabilityOptions["keepClasses"] = true; + } + + + if (!args["low-confidence"]) { + args["low-confidence"] = LowConfidenceMode.keep; + args['l'] = LowConfidenceMode.keep; + } else if (!Object.values(LowConfidenceMode).includes(args["low-confidence"])) { + console.error(__`Unknown mode: ${args["low-confidence"]}\nPlease use one of: keep, force, exit`); + console.error(__`Use --help for more info.`); + setErrored(ExitCodes.badUsageCLI); + return; + } + + + let inputArg; + if (!args["source"]) { + if (process.stdin.isTTY) { + console.error(__`No input provided`); + printUsage(); + setErrored(ExitCodes.badUsageCLI); + return; + } else { + inputArg = '-' + } + } else { + inputArg = args["source"]; + } + + //Get input parameter, remove inputArg from args + let inputFile; + let inputURL; + let inputIsFromStdin = false; + + if (args["is-url"] && inputArg.search(/^\w+:\/\//) == -1) + inputArg = "https://" + inputArg; + if (!args["is-file"] && inputArg.search(/^\w+:\/\//) != -1) + inputURL = inputArg; + else if (inputArg == '-') + inputIsFromStdin = true; + else + inputFile = inputArg; + + + const outputArg = args["output"]; + const documentURL = args["base"] || inputURL; + const outputJSON = args["json"]; + + let proxy = args["proxy"]; + if (!proxy && typeof Deno === "undefined") + proxy = process.env.https_proxy || process.env.HTTPS_PROXY || process.env.http_proxy; + + + let wantedProperties; + if (args["properties"]) { + wantedProperties = args["properties"]; + } + + + if (errored) { + printUsage(); + return; + } + + async function read(stream) { + const chunks = []; + for await (const chunk of stream){ + chunks.push(chunk); + } + return Buffer.concat(chunks).toString("utf8"); + } + + + + let document, window + try { + if (inputIsFromStdin) { + if (!args["quiet"]) { + console.error("Reading..."); + if (!documentURL) + console.error(__`Warning: piping input with unknown URL. This means that relative links will be broken. 
Supply the --base parameter to fix.`) + } + const input = await read(process.stdin); + [document, window] = await parseDOM(result, documentURL); + } else { + if (!args["quiet"]) + console.error(__`Retrieving...`); + + let parseDOMPromise; + if (inputURL) { + parseDOMPromise = parseDOMFromURL(documentURL, proxy, !args["insecure"], args["user-agent"]); + } else if (inputFile) { + parseDOMPromise = parseDOMFromFile(inputFile, documentURL); + } + [document, window] = await parseDOMPromise; + } + } catch (error) { + if (error.error) { + //Nested error? + error = error.error; + } + if (error instanceof TypeError && inputURL) { + console.error(__`Invalid URL: ${inputURL}`); + setErrored(ExitCodes.badUsageCLI); + } else if (error.code == "ENOENT") { + console.error(error.message); + setErrored(ExitCodes.noInput); + } else if (error.code == "EACCES") { + console.error(error.message); + setErrored(ExitCodes.noPermission); + } else if (error.code == "ENOTFOUND") { + console.error(__`Host not found: ${error.hostname}`); + setErrored(ExitCodes.noHost); + } else if (error.statusCode) { + console.error(__`Status error: ${error.response.statusMessage}`); + setErrored(ExitCodes.serviceUnavailable); + } else { + console.error(error.message); + //console.error(error); + setErrored(ExitCodes.serviceUnavailable); + } + return; + } + + + + //Taken from https://stackoverflow.com/a/22706073/5701177 + function escapeHTML(string, document) { + var p = document.createElement("p"); + p.appendChild(document.createTextNode(string)); + return p.innerHTML; + } + + async function getHTML(document, window) { + if (args["insane"]) + return document.documentElement.outerHTML; + else + return await sanitizeDOM(document, window); + } + + let shouldParseArticle = true; + if (args["low-confidence"] != LowConfidenceMode.force) + shouldParseArticle = isProbablyReaderable(document); + + if (!shouldParseArticle) { + if (args["low-confidence"] == LowConfidenceMode.exit) { + console.error(__`Not sure if this 
document should be processed, exiting`); + setErrored(ExitCodes.dataError); + return; + } else { + if (!args["quiet"]) + console.error(__`Not sure if this document should be processed. Not processing`); + if (args["json"] || wantedProperties) { + console.error(__`Can't output properties`); + setErrored(ExitCodes.dataError); + return; + } + shouldParseArticle = false; + } + } + + let writeStream; + if (outputArg) { + writeStream = fs.createWriteStream(outputArg); + } else { + writeStream = process.stdout; + } + + + if (!shouldParseArticle) { + //Ignore wantedProperties, that should've thrown an error before + writeStream.write(await getHTML(document, window)); + return; + } + + if (!args["quiet"]) + console.error(__`Processing...`); + + const reader = new Readability(document, readabilityOptions); + const article = reader.parse(); + if (!article) { + if (args["low-confidence"] == LowConfidenceMode.keep) { + if (!args["quiet"]) + console.error(__`Couldn't process document.`); + writeStream.write(await getHTML(document, window)); + } else { + console.error(__`Couldn't process document.`); + setErrored(ExitCodes.dataError); + } + return; + } + if (outputJSON) { + let result = {}; + if (wantedProperties) { + for (propertyName of wantedProperties) + result[propertyName] = Properties.get(propertyName)(article, false, window); + } else { + for (const [name, func] of Properties) { + result[name] = func(article, false, window); + } + } + writeStream.write(JSON.stringify(result)); + } else { + if (wantedProperties) { + for (propertyName of wantedProperties) + writeStream.write(Properties.get(propertyName)(article, true, window) + '\n'); + } else { + writeStream.write(` + + + `); + if (args["style"] || !args["keep-classes"]) { + const cssHref = args["style"] || "chrome://global/skin/aboutReader.css"; + writeStream.write(` + `); + } + writeStream.write(` + ${escapeHTML(Properties.get("title")(article, false, window), document)} + +` + ); + + if (!args["keep-classes"]) { + //Add 
a few divs and classes so that Firefox Reader Mode CSS works well + writeStream.write(` + +
`); + else + writeStream.write('>'); + + writeStream.write(` +
+

${escapeHTML(Properties.get("title")(article, false, window), document)}

`); + + const author = Properties.get("byline")(article, false, window); + if (author) { + writeStream.write(` +
${escapeHTML(author, document)}
`); + } + + writeStream.write(` +
+ +
+ +
+
+` + ); + const html = Properties.get("html-content")(article, false, window); + if (!args["insane"]) + writeStream.write(await sanitizeHTML(html, window)); + else + writeStream.write(html); + writeStream.write(` +
+
+
+` + ); + } else { + writeStream.write("\n\n"); + writeStream.write(Properties.get("html-title")(article, false, window)); + writeStream.write('\n'); + + const author = Properties.get("byline")(article, false, window); + if (author) { + writeStream.write(`

${escapeHTML(author, document)}

`); + } + writeStream.write("\n
\n"); + const html = Properties.get("html-content")(article, false, window); + if (!args["insane"]) + writeStream.write(await sanitizeHTML(html, window)); + else + writeStream.write(html); + } + + + writeStream.write("\n"); + } + } +} diff --git a/index.js b/index.js index e4726b0..e4ab672 100755 --- a/index.js +++ b/index.js @@ -4,7 +4,7 @@ Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library - Copyright (C) 2021 gardenapple + Copyright (C) 2022 gardenapple This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -20,7 +20,12 @@ Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library along with this program. If not, see . */ +const fs = require("fs"); const path = require("path"); +const process = require("process"); + +const yargs = require("yargs"); +const y18n = require("y18n"); // GNU gettext gives preference to LANGUAGE above all else, but this order is consistent with Yargs: const locale = ( @@ -31,541 +36,79 @@ const locale = ( "en_US" ).replace(/[.:].*/, ''); -const yargs = require("yargs"); -const __ = require("y18n")({ +const __ = y18n({ locale: locale, updateFiles: false, - directory: path.resolve(__dirname, "locales") + directory: path.resolve(__dirname, "./locales") }).__; -//JSDOM, fs, Readability, and Readability-readerable are loaded on-demand. - -const ExitCodes = { - badUsageCLI: 64, - dataError: 65, - noInput: 66, - noHost: 68, - serviceUnavailable: 69, - noPermission: 77 -}; - -let errored = false; - -function setErrored(exitCode) { - process.exitCode = exitCode; - errored = true; -} - - -// -//Parsing arguments -// - -const Properties = new Map([ - ["html-title", (article, singleLine, window) => - `

${escapeHTML(Properties.get("title")(article, singleLine, window), window.document)}

` - ], - ["title", (article, singleLine) => - singleLine ? article.title.replace(/\n+/gm, ' ') : article.title - ], - ["excerpt", (article, singleLine) => - singleLine ? article.excerpt.replace(/\n+/gm, ' ') : article.excerpt - ], - ["byline", (article, singleLine) => - singleLine ? article.byline.replace(/\n+/gm, ' ') : article.byline - ], - ["length", article => article.length], - ["dir", article => article.dir], - ["text-content", article => article.textContent], - ["html-content", (article, _, window) => { - if (!args["insane"]) { - const createDOMPurify = require("dompurify"); - const DOMPurify = createDOMPurify(window); - return DOMPurify.sanitize(article.content); - } - return article.content; - }] -]); - -const LowConfidenceMode = { - keep: "keep", - force: "force", - exit: "exit" -}; - -const readabilityOptions = {}; - -//backwards compat with old, comma-separated values -function yargsCompatProperties(args) { - if (args["properties"]) { - for (var i = 0; i < args["properties"].length; i++) { - const property = args["properties"][i]; - if (property.indexOf(',') > -1) { - const split = args["properties"][i].split(','); - args["properties"].splice(i, 1, ...split); - continue; - } - if (!Properties.has(property)) { - args["properties"].splice(i, 1); - i--; - if (!args["--"]) - args["--"] = [ property ]; - else - args["--"].push(property); - } - } - } -} - -//Positional sometimes don't get recognized when they're put -//after other arguments, I think it's an oversight in yargs. 
-function yargsFixPositional(args) { - if (args["--"]) { - if (!args["source"]) - args["source"] = args["--"].shift(); - args["_"] = args["--"]; - } -} - - -let args = yargs - .version(false) - .command("* [source]", __`Process HTML input`, (yargs) => { - yargs.positional("source", { - desc: __`A file, an http(s) URL, or '-' for standard input`, - type: "string" - }); - }) - .completion('--completion', false) - .middleware([ yargsCompatProperties, yargsFixPositional ], true) //middleware seems to be buggy - .option("help", { - alias: 'h', - type: "boolean", - desc: __`Show help` - }) - .option("completion", { - type: "boolean", - desc: __`Print script for bash/zsh completion` - }) - .option("base", { - alias: 'b', - type: "string", - desc: __`Set the document URL when parsing standard input or a local file (this affects relative links)` - }) - .option("insane", { - alias: 'S', - type: "boolean", - desc: __`Don't sanitize HTML` - }) - .option("insecure", { - alias: 'K', - type: "boolean", - desc: __`Allow invalid SSL certificates` - }) - .option("is-file", { - alias: 'f', - type: "boolean", - desc: __`Interpret SOURCE as a file name rather than a URL`, - default: false, - hidden: true, - //deprecated: true - }) - .option("is-url", { - alias: 'U', - type: "boolean", - desc: __`(deprecated) Interpret SOURCE as a URL rather than file name`, - hidden: true, - //deprecated: true - }) - .option("json", { - alias: 'j', - type: "boolean", - desc: __`Output properties as a JSON payload` - }) - .option("low-confidence", { - alias: 'l', - type: "string", - desc: __`What to do if Readability.js is uncertain about what the core content actually is`, - choices: ["keep", "force", "exit"] - //default: "no-op", //don't set default because completion won't work - }) - .option("keep-classes", { - alias: 'C', - type: "boolean", - desc: __`Preserve all CSS classes for input elements, instead of adapting to Firefox's Reader Mode` - }) - .option("output", { - alias: 'o', - type: "string", 
- desc: __`The file to which the result should be output` - }) - .option("properties", { - alias: 'p', - type: "array", - desc: __`Output specific properties of the parsed article`, - choices: Array.from(Properties.keys()) - }) - .option("proxy", { - alias: 'x', - type: "string", - desc: __`Use specified proxy (can also use HTTPS_PROXY environment variable)` - }) - .option("quiet", { - alias: 'q', - type: "boolean", - desc: __`Don't output extra information to stderr` - }) - .option("style", { - alias: 's', - type: "string", - desc: __`Specify .css file for stylesheet` - }) - .option("url", { - alias: 'u', - type: "string", - desc: __`(deprecated) alias for --base`, - hidden: true, - //deprecated: true //completion script does not respect this value, so just say it in the description - }) - .option("user-agent", { - alias: 'A', - type: "string", - desc: __`Set custom user agent string` - }) - .option("version", { - alias: 'V', - type: "boolean", - desc: __`Print version` - }) - .epilogue(__`See the manual for more info: man readability-cli`) - .wrap(Math.min(yargs.terminalWidth(), 100)) - .strict() - .parse(); - -if (args["is-url"]) { - console.error(__`Note: --is-url option is deprecated.`); -} -if (args["url"]) { - console.error(__`Note: --url option is deprecated, please use --base instead.`); - args["base"] = args["url"]; -} - - -function printUsage() { - yargs.showHelp(); -} - -if (args["completion"]) { - yargs.showCompletionScript(); - return; -} - +const { Readability, isProbablyReaderable } = require("@mozilla/readability"); -if (args["version"]) { +function printVersion() { console.log(`readability-cli v${require("./package.json").version}`); console.log(`Node.js ${process.version}`); - return; -} - - - -if (args["keep-classes"]) { - readabilityOptions["keepClasses"] = true; -} - - -if (!args["low-confidence"]) { - args["low-confidence"] = LowConfidenceMode.keep; - args['l'] = LowConfidenceMode.keep; -} else if 
(!Object.values(LowConfidenceMode).includes(args["low-confidence"])) { - console.error(__`Unknown mode: ${args["low-confidence"]}\nPlease use one of: keep, force, exit`); - console.error(__`Use --help for more info.`); - setErrored(ExitCodes.badUsageCLI); - return; } +async function parseDOMFromURL(url, proxy, strictSSL, userAgent) { + const { JSDOM, ResourceLoader } = require("jsdom"); + const resourceLoader = new ResourceLoader({ + proxy: proxy, + strictSSL: strictSSL, + userAgent: userAgent + }); -let inputArg; -if (!args["source"]) { - if (process.stdin.isTTY) { - console.error(__`No input provided`); - printUsage(); - setErrored(ExitCodes.badUsageCLI); - return; - } else { - inputArg = '-' - } -} else { - inputArg = args["source"]; + const dom = await JSDOM.fromURL(url, { + resources: resourceLoader + }) + return [dom.window.document, dom.window]; } -//Get input parameter, remove inputArg from args -let inputFile; -let inputURL; -let inputIsFromStdin = false; - -if (args["is-url"] && inputArg.search(/^\w+:\/\//) == -1) - inputArg = "https://" + inputArg; -if (!args["is-file"] && inputArg.search(/^\w+:\/\//) != -1) - inputURL = inputArg; -else if (inputArg == '-') - inputIsFromStdin = true; -else - inputFile = inputArg; - - -const outputArg = args["output"]; -const documentURL = args["base"] || inputURL; -const outputJSON = args["json"]; - -const proxy = args["proxy"] || process.env.https_proxy || process.env.HTTPS_PROXY || process.env.http_proxy; - - -let wantedProperties; -if (args["properties"]) { - wantedProperties = args["properties"]; +async function parseDOM(html, url) { + const { JSDOM } = require("jsdom"); + const dom = new JSDOM(html, { url: url }); + return [dom.window.document, dom.window]; } - -if (errored) { - printUsage(); - return; +async function parseDOMFromFile(file, url) { + const { JSDOM } = require("jsdom"); + const dom = await JSDOM.fromFile(file, { + url: url, + // workaround for https://gitlab.com/gardenappl/readability-cli/-/issues/9 + 
contentType: "text/html; charset=utf-8" + }) + return [dom.window.document, dom.window]; } -async function read(stream) { - const chunks = []; - for await (const chunk of stream){ - chunks.push(chunk); - } - return Buffer.concat(chunks).toString("utf8"); +async function sanitizeHTML(html, window) { + const createDOMPurify = require("dompurify"); + const DOMPurify = createDOMPurify(window); + return DOMPurify.sanitize(html); } - - -if (inputIsFromStdin) { - if (!args["quiet"]) { - console.error("Reading..."); - if (!documentURL) - console.error(__`Warning: piping input with unknown URL. This means that relative links will be broken. Supply the --base parameter to fix.`) - } - read(process.stdin).then(result => { - const JSDOM = require("jsdom").JSDOM; - onLoadDOM(new JSDOM(result, { url: documentURL })); - }); -} else { - if (!args["quiet"]) - console.error(__`Retrieving...`); - const jsdom = require("jsdom"); - - let promiseGetHTML; - if (inputURL) { - const resourceLoader = new jsdom.ResourceLoader({ - proxy: proxy, - strictSSL: !args["insecure"], - userAgent: args["user-agent"] - }); - promiseGetHTML = jsdom.JSDOM.fromURL(inputURL, { - resources: resourceLoader - }); - } else if (inputFile) { - promiseGetHTML = jsdom.JSDOM.fromFile(inputFile, { - url: documentURL, - // workaround for https://gitlab.com/gardenappl/readability-cli/-/issues/9 - contentType: "text/html; charset=utf-8" - }); - } - - promiseGetHTML.then(onLoadDOM, onLoadDOMError); +async function sanitizeDOM(document, window) { + const createDOMPurify = require("dompurify"); + const DOMPurify = createDOMPurify(window); + DOMPurify.sanitize(document, {IN_PLACE: true, WHOLE_DOCUMENT: true}); + return document.documentElement.outerHTML; } - -const { Readability, isProbablyReaderable } = require("@mozilla/readability"); - -//Taken from https://stackoverflow.com/a/22706073/5701177 -function escapeHTML(string, document) { - var p = document.createElement("p"); - 
p.appendChild(document.createTextNode(string)); - return p.innerHTML; -} - -function onLoadDOM(dom) { - const window = dom.window - - let shouldParseArticle = true; - if (args["low-confidence"] != LowConfidenceMode.force) - shouldParseArticle = isProbablyReaderable(window.document); - - if (!shouldParseArticle) { - if (args["low-confidence"] == LowConfidenceMode.exit) { - console.error(__`Not sure if this document should be processed, exiting`); - setErrored(ExitCodes.dataError); - return; - } else { - if (!args["quiet"]) - console.error(__`Not sure if this document should be processed. Not processing`); - if (args["json"] || wantedProperties) { - console.error(__`Can't output properties`); - setErrored(ExitCodes.dataError); - return; - } - shouldParseArticle = false; - } - } - - let writeStream; - if (outputArg) { - const fs = require("fs"); - writeStream = fs.createWriteStream(outputArg); - } else { - writeStream = process.stdout; - } - - - if (!shouldParseArticle) { - //Ignore wantedProperties, that should've thrown an error before - writeStream.write(getHTML(window)); - return; - } - - if (!args["quiet"]) - console.error(__`Processing...`); - - const reader = new Readability(window.document, readabilityOptions); - const article = reader.parse(); - if (!article) { - if (args["low-confidence"] == LowConfidenceMode.keep) { - if (!args["quiet"]) - console.error(__`Couldn't process document.`); - writeStream.write(getHTML(window)); - } else { - console.error(__`Couldn't process document.`); - setErrored(ExitCodes.dataError); - } - return; - } - if (outputJSON) { - let result = {}; - if (wantedProperties) { - for (propertyName of wantedProperties) - result[propertyName] = Properties.get(propertyName)(article, false, window); - } else { - for (const [name, func] of Properties) { - result[name] = func(article, false, window); - } - } - writeStream.write(JSON.stringify(result)); - } else { - if (wantedProperties) { - for (propertyName of wantedProperties) - 
writeStream.write(Properties.get(propertyName)(article, true, window) + '\n'); - } else { - writeStream.write(` - - - `); - if (args["style"] || !args["keep-classes"]) { - const cssHref = args["style"] || "chrome://global/skin/aboutReader.css"; - writeStream.write(` - `); - } - writeStream.write(` - ${escapeHTML(Properties.get("title")(article, false, window), window.document)} - -` - ); - - if (!args["keep-classes"]) { - //Add a few divs and classes so that Firefox Reader Mode CSS works well - writeStream.write(` - -
`); - else - writeStream.write('>'); - - writeStream.write(` -
-

${escapeHTML(Properties.get("title")(article, false, window), window.document)}

`); - - const author = Properties.get("byline")(article, false, window); - if (author) { - writeStream.write(` -
${escapeHTML(author, window.document)}
`); - } - - writeStream.write(` -
- -
- -
-
-` - ); - writeStream.write(Properties.get("html-content")(article, false, window)); - writeStream.write(` -
-
-
-` - ); - } else { - writeStream.write("\n\n"); - writeStream.write(Properties.get("html-title")(article, false, window)); - writeStream.write('\n'); - - const author = Properties.get("byline")(article, false, window); - if (author) { - writeStream.write(`

${escapeHTML(author, window.document)}

`); - } - writeStream.write("\n
\n"); - writeStream.write(Properties.get("html-content")(article, false, window)); - } - - - writeStream.write("\n"); - } - } -} - -function onLoadDOMError(error) { - if (error.error) { - //Nested error? - error = error.error; - } - if (error instanceof TypeError && inputURL) { - console.error(__`Invalid URL: ${inputURL}`); - setErrored(ExitCodes.badUsageCLI); - } else if (error.code == "ENOENT") { - console.error(error.message); - setErrored(ExitCodes.noInput); - } else if (error.code == "EACCES") { - console.error(error.message); - setErrored(ExitCodes.noPermission); - } else if (error.code == "ENOTFOUND") { - console.error(__`Host not found: ${error.hostname}`); - setErrored(ExitCodes.noHost); - } else if (error.statusCode) { - console.error(__`Status error: ${error.response.statusMessage}`); - setErrored(ExitCodes.serviceUnavailable); - } else { - console.error(error.message); - //console.error(error); - setErrored(ExitCodes.serviceUnavailable); - } -} - -function getHTML(window) { - let html = window.document.documentElement.outerHTML; - if (!args["insane"]) { - const createDOMPurify = require("dompurify"); - const DOMPurify = createDOMPurify(window); - return DOMPurify.sanitize(html, {IN_PLACE: true, WHOLE_DOCUMENT: true}); - } - return html; -} +import("./common.mjs").then((module) => { + const readable = module.default; + readable( + Buffer, + fs, + process, + yargs, + __, + Readability, + isProbablyReaderable, + printVersion, + parseDOM, + parseDOMFromFile, + parseDOMFromURL, + sanitizeDOM, + sanitizeHTML + ); +}); diff --git a/locales/en.json b/locales/en.json index fe0f3af..a6edaf9 100644 --- a/locales/en.json +++ b/locales/en.json @@ -34,5 +34,6 @@ "Host not found: '%s'": "Host not found: '%s'", "Unknown mode: %s\nPlease use one of: keep, force, exit": "Unknown mode: %s\nPlease use one of: keep, force, exit", "Use --help for more info.": "Use --help for more info.", - "See the manual for more info: man readability-cli": "See the manual for more info: 
man readability-cli" + "See the manual for more info: man readability-cli": "See the manual for more info: man readability-cli", + "Error: option --%s is not supported in Deno.": "Error: option --%s is not supported in Deno." } diff --git a/locales/ru.json b/locales/ru.json index a752137..d53c583 100644 --- a/locales/ru.json +++ b/locales/ru.json @@ -34,5 +34,6 @@ "Host not found: '%s'": "Сервер не найден: '%s'", "Unknown mode: %s\nPlease use one of: keep, force, exit": "Неизвестный режим: %s\nПожалуйста, используйте один из: keep, force, exit", "Use --help for more info.": "Чтобы узнать больше, воспользуйтесь --help", - "See the manual for more info: man readability-cli": "Чтобы узнать больше, загляните в мануал: man readability-cli" + "See the manual for more info: man readability-cli": "Чтобы узнать больше, загляните в мануал: man readability-cli", + "Error: option --%s is not supported in Deno.": "Ошибка: параметр --%s не поддерживается в Deno." } diff --git a/package-lock.json b/package-lock.json index a54b460..2fdbed5 100644 --- a/package-lock.json +++ b/package-lock.json @@ -13,7 +13,7 @@ "dompurify": "^2.3.4", "jsdom": "^19.0.0", "y18n": "^5.0.8", - "yargs": "^17.3.0" + "yargs": "^17.4.0" }, "bin": { "readable": "index.js" @@ -776,9 +776,9 @@ } }, "node_modules/yargs": { - "version": "17.3.0", - "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.3.0.tgz", - "integrity": "sha512-GQl1pWyDoGptFPJx9b9L6kmR33TGusZvXIZUT+BOz9f7X2L94oeAskFYLEg/FkhV06zZPBYLvLZRWeYId29lew==", + "version": "17.4.0", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.4.0.tgz", + "integrity": "sha512-WJudfrk81yWFSOkZYpAZx4Nt7V4xp7S/uJkX0CnxovMCt1wCE8LNftPpNuF9X/u9gN5nsD7ycYtRcDf2pL3UiA==", "dependencies": { "cliui": "^7.0.2", "escalade": "^3.1.1", @@ -1344,9 +1344,9 @@ "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==" }, "yargs": { - "version": "17.3.0", - "resolved": 
"https://registry.npmjs.org/yargs/-/yargs-17.3.0.tgz", - "integrity": "sha512-GQl1pWyDoGptFPJx9b9L6kmR33TGusZvXIZUT+BOz9f7X2L94oeAskFYLEg/FkhV06zZPBYLvLZRWeYId29lew==", + "version": "17.4.0", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.4.0.tgz", + "integrity": "sha512-WJudfrk81yWFSOkZYpAZx4Nt7V4xp7S/uJkX0CnxovMCt1wCE8LNftPpNuF9X/u9gN5nsD7ycYtRcDf2pL3UiA==", "requires": { "cliui": "^7.0.2", "escalade": "^3.1.1", diff --git a/package.json b/package.json index 7caece1..15e0f7c 100644 --- a/package.json +++ b/package.json @@ -32,7 +32,7 @@ "dompurify": "^2.3.4", "jsdom": "^19.0.0", "y18n": "^5.0.8", - "yargs": "^17.3.0" + "yargs": "^17.4.0" }, "devDependencies": { "marked-man": "^0.7.0" diff --git a/readability-cli.1.md b/readability-cli.1.md index e8ae3fd..45b07ae 100644 --- a/readability-cli.1.md +++ b/readability-cli.1.md @@ -28,7 +28,7 @@ The *SOURCE* can be a URL, a file, or '-' for standard input. `-K`, `--insecure` -* Allow invalid SSL certificates. +* (Node.js version only) Allow invalid SSL certificates. `-j`, `--json` @@ -58,7 +58,7 @@ The *SOURCE* can be a URL, a file, or '-' for standard input. `-x`, `--proxy` *URL* -* Use specified proxy (can also use `HTTPS_PROXY` environment variable). +* (Node.js version only) Use specified proxy. Node.js and Deno can also use `HTTPS_PROXY` environment variable. `-q`, `--quiet` @@ -114,7 +114,7 @@ As usual, exit code 0 indicates success, and anything other than 0 is an error. **readability-cli** supports localization, using the environment variables `LC_ALL`, `LC_MESSAGES`, `LANG` and `LANGUAGE`, in that order. Only one language at a time is supported. -`HTTPS_PROXY` will set the HTTPS proxy, as previously stated, however the `--proxy` option overrides this. Lowercase `https_proxy` and `http_proxy` are also recognized. +`HTTPS_PROXY` will set the HTTPS proxy, as previously stated, however the `--proxy` option overrides this. 
Node.js also recognizes lowercase `https_proxy` and `http_proxy`, for compatibility with `curl`. ## EXAMPLE
diff --git a/readable.ts b/readable.ts
new file mode 100644
index 0000000..a7a7859
--- /dev/null
+++ b/readable.ts
@@ -0,0 +1,187 @@
+#!/usr/bin/env -S deno run --allow-net --allow-read --allow-write --allow-env=HTTPS_PROXY,LC_ALL,LC_MESSAGES,LANG,LANGUAGE --no-prompt --no-check
+
+const version = "2.3.5"
+
+import * as path from "https://deno.land/std@0.130.0/path/mod.ts"
+
+import { Buffer } from "https://deno.land/std@0.130.0/node/buffer.ts"
+import fs from "https://deno.land/std@0.130.0/node/fs.ts"
+import process from "https://deno.land/std@0.130.0/node/process.ts"
+
+import yargs from "https://deno.land/x/yargs@v17.4.0-deno/deno.ts"
+import y18n from "https://deno.land/x/y18n@v5.0.8-deno/deno.ts"
+
+import { initParser, DOMParser, DOMParserMimeType, Document, Element } from "https://deno.land/x/deno_dom@v0.1.21-alpha/deno-dom-wasm-noinit.ts"
+import * as ammonia from "https://deno.land/x/ammonia@0.3.1/mod.ts"
+
+import { Readability, isProbablyReaderable } from "https://esm.sh/@mozilla/readability@0.4.1?no-check"
+import UserAgent from "https://esm.sh/user-agents@1.0.963"
+
+// GNU gettext gives preference to LANGUAGE above all else, but this order is consistent with Yargs:
+const locale = (
+	Deno.env.get("LC_ALL") ||
+	Deno.env.get("LC_MESSAGES") ||
+	Deno.env.get("LANG") ||
+	Deno.env.get("LANGUAGE") ||
+	"en_US"
+).replace(/[.:].*/, '')
+
+const __ = y18n({
+	locale: locale,
+	updateFiles: false,
+	directory: path.join(path.dirname(path.fromFileUrl(import.meta.url)), "locales")
+}).__
+
+
+/** Prints the readability-cli version followed by the Deno runtime version. */
+function printVersion() {
+	console.log(`readability-cli v${version}`)
+	console.log(`Deno ${Deno.version.deno}`)
+}
+
+/** Parses `input` (relative to `base`, when given); returns undefined instead of throwing on failure. */
+function tryParseURL(input: string, base?: string | URL): URL | undefined {
+	try {
+		return new URL(input, base)
+	} catch (_e) {
+		return undefined
+	}
+}
+
+/**
+ * Extracts the bare media type from a Content-Type header value.
+ *
+ * Parameters such as "; charset=utf-8" are dropped. A header without ';' is
+ * kept whole: slicing up to indexOf(';') would cut off its last character,
+ * since indexOf returns -1. A missing or empty header yields "text/html".
+ */
+function mimeTypeFromContentType(contentType: string | null): string {
+	return contentType?.split(";")[0].trim().toLowerCase() || "text/html"
+}
+
+/**
+ * Picks the URL that relative links are resolved against: the document's
+ * <base href> (resolved against `url` when it is itself relative), otherwise
+ * `url`. Returns undefined when there is nothing usable.
+ *
+ * An unparseable `url` still throws a TypeError when no <base href> applies.
+ */
+function resolveBaseURL(baseHref: string | null | undefined, url?: string): URL | undefined {
+	if (baseHref) {
+		const resolved = tryParseURL(baseHref) ?? (url ? tryParseURL(baseHref, url) : undefined)
+		if (resolved) {
+			return resolved
+		}
+	}
+	return url ? new URL(url) : undefined
+}
+
+/**
+ * Fetches `url` and parses the response body into a DOM.
+ *
+ * `proxy` and `strictSSL` are not used by this implementation; the manual
+ * documents --proxy and --insecure as Node.js-only.
+ *
+ * On a non-OK response, throws a plain object carrying `statusCode` and
+ * `response.statusMessage`.
+ */
+async function parseDOMFromURL(url: string, proxy?: string, strictSSL?: boolean, userAgent?: string) {
+	// Kick off parser initialization so it overlaps with the network request
+	const initParserPromise = initParser()
+
+	const userAgentString = userAgent ?? new UserAgent({ deviceCategory: "desktop" }).toString()
+	const response = await fetch(url, {
+		headers: {
+			"User-Agent": userAgentString
+		}
+	})
+	if (!response.ok) {
+		throw {
+			statusCode: response.status,
+			response: {
+				statusMessage: response.statusText
+			}
+		}
+	}
+	const text = await response.text()
+	await initParserPromise
+
+	const mimeType = mimeTypeFromContentType(response.headers.get("Content-Type"))
+	return parseDOM(text, url, mimeType as DOMParserMimeType)
+}
+
+/**
+ * Parses `html` into a document and rewrites every relative `href` attribute
+ * into an absolute URL, using the base chosen by resolveBaseURL(). When no
+ * base is available, or an href cannot be resolved, the attribute is left
+ * untouched.
+ *
+ * Returns a one-element array containing the document.
+ */
+async function parseDOM(html: string, url?: string, mimeType?: DOMParserMimeType) {
+	await initParser()
+	const document = new DOMParser().parseFromString(html, mimeType ?? "text/html")!
+
+	const baseHref = document.getElementsByTagName("base")[0]?.getAttribute("href")
+	const baseURL = resolveBaseURL(baseHref, url)
+	if (baseURL) {
+		// Depth-first walk over all elements, using an explicit stack
+		const nodes: Element[] = [document.documentElement!]
+
+		while (nodes.length > 0) {
+			const element = nodes.pop()!
+			const href = element.getAttribute("href")
+			// An href that parses on its own is already absolute
+			if (href && !tryParseURL(href)) {
+				const absolute = tryParseURL(href, baseURL)
+				if (absolute) {
+					element.setAttribute("href", absolute.href)
+				}
+			}
+
+			nodes.push(...element.children)
+		}
+	}
+	return [document]
+}
+
+/** Reads `file`, decodes its bytes with a default TextDecoder, and parses the text via parseDOM() with `url` as the base. */
+async function parseDOMFromFile(file: string, url: string) {
+	const data = await Deno.readFile(file)
+	return parseDOM(new TextDecoder().decode(data), url)
+}
+
+/** Initializes ammonia and returns `html` cleaned by ammonia.clean(). */
+async function sanitizeHTML(html: string) {
+	await ammonia.init()
+	return ammonia.clean(html)
+}
+
+/** Serializes the document's root element and sanitizes the resulting HTML. */
+async function sanitizeDOM(document: Document) {
+	return sanitizeHTML(document.documentElement!.outerHTML)
+}
+
+
+import readable from "./common.mjs"
+await readable(
+	Buffer,
+	fs,
+	process,
+	yargs(Deno.args),
+	__,
+	Readability,
+	isProbablyReaderable,
+	printVersion,
+	parseDOM,
+	parseDOMFromFile,
+	parseDOMFromURL,
+	sanitizeDOM,
+	sanitizeHTML
+)
+
+// NOTE(review): presumably needed because an exit code stored on the process shim
+// is not applied when the script simply finishes; confirm against the shim.
+if (process.exitCode) {
+	process.exit()
+}