diff --git a/common.mjs b/common.mjs
new file mode 100644
index 0000000..fab9508
--- /dev/null
+++ b/common.mjs
@@ -0,0 +1,557 @@
+/*
+
+Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library
+
+ Copyright (C) 2022 gardenapple
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
+*/
+
+export default async function(
+ Buffer,
+ fs,
+ process,
+ yargs,
+ __,
+ Readability,
+ isProbablyReaderable,
+ printVersion,
+ parseDOM,
+ parseDOMFromFile,
+ parseDOMFromURL,
+ sanitizeDOM,
+ sanitizeHTML
+) {
+ let errored = false;
+
+ function setErrored(exitCode) {
+ process.exitCode = exitCode;
+ errored = true;
+ }
+
+ const ExitCodes = {
+ badUsageCLI: 64,
+ dataError: 65,
+ noInput: 66,
+ noHost: 68,
+ serviceUnavailable: 69,
+ noPermission: 77
+ };
+
+ //
+ //Parsing arguments
+ //
+
+ const Properties = new Map([
+ ["html-title", (article, singleLine, window) =>
+ `
${escapeHTML(Properties.get("title")(article, singleLine, window), window.document)} `
+ ],
+ ["title", (article, singleLine) =>
+ singleLine ? article.title.replace(/\n+/gm, ' ') : article.title
+ ],
+ ["excerpt", (article, singleLine) =>
+ singleLine ? article.excerpt.replace(/\n+/gm, ' ') : article.excerpt
+ ],
+ ["byline", (article, singleLine) =>
+ singleLine ? article.byline.replace(/\n+/gm, ' ') : article.byline
+ ],
+ ["length", article => article.length],
+ ["dir", article => article.dir],
+ ["text-content", article => article.textContent],
+ ["html-content", (article, _, window) => article.content]
+ ]);
+
+ const LowConfidenceMode = {
+ keep: "keep",
+ force: "force",
+ exit: "exit"
+ };
+
+ const readabilityOptions = {};
+
+ //backwards compat with old, comma-separated values
+ function yargsCompatProperties(args) {
+ if (args["properties"]) {
+ for (var i = 0; i < args["properties"].length; i++) {
+ const property = args["properties"][i];
+ if (property.indexOf(',') > -1) {
+ const split = args["properties"][i].split(',');
+ args["properties"].splice(i, 1, ...split);
+ continue;
+ }
+ if (!Properties.has(property)) {
+ args["properties"].splice(i, 1);
+ i--;
+ if (!args["--"])
+ args["--"] = [ property ];
+ else
+ args["--"].push(property);
+ }
+ }
+ }
+ }
+
+ //Positional sometimes don't get recognized when they're put
+ //after other arguments, I think it's an oversight in yargs.
+ function yargsFixPositional(args) {
+ if (args["--"]) {
+ if (!args["source"])
+ args["source"] = args["--"].shift();
+ args["_"] = args["--"];
+ }
+ }
+
+
+ let args = yargs
+ .version(false)
+ .command("* [source]", __`Process HTML input`, (yargs) => {
+ yargs.positional("source", {
+ desc: __`A file, an http(s) URL, or '-' for standard input`,
+ type: "string"
+ });
+ })
+ .completion("--completion", false)
+ .middleware([ yargsCompatProperties, yargsFixPositional ], true) //middleware seems to be buggy
+ .option("help", {
+ alias: 'h',
+ type: "boolean",
+ desc: __`Show help`
+ })
+ .option("completion", {
+ type: "boolean",
+ desc: __`Print script for bash/zsh completion`
+ })
+ .option("base", {
+ alias: 'b',
+ type: "string",
+ desc: __`Set the document URL when parsing standard input or a local file (this affects relative links)`
+ })
+ .option("insane", {
+ alias: 'S',
+ type: "boolean",
+ desc: __`Don't sanitize HTML`
+ })
+ .option("insecure", {
+ alias: 'K',
+ type: "boolean",
+ desc: __`Allow invalid SSL certificates`,
+ hidden: typeof Deno !== "undefined"
+ })
+ .option("is-file", {
+ alias: 'f',
+ type: "boolean",
+ desc: __`Interpret SOURCE as a file name rather than a URL`,
+ default: false,
+ hidden: true,
+ //deprecated: true
+ })
+ .option("is-url", {
+ alias: 'U',
+ type: "boolean",
+ desc: __`(deprecated) Interpret SOURCE as a URL rather than file name`,
+ hidden: true,
+ //deprecated: true
+ })
+ .option("json", {
+ alias: 'j',
+ type: "boolean",
+ desc: __`Output properties as a JSON payload`
+ })
+ .option("low-confidence", {
+ alias: 'l',
+ type: "string",
+ desc: __`What to do if Readability.js is uncertain about what the core content actually is`,
+ choices: ["keep", "force", "exit"]
+ //default: "no-op", //don't set default because completion won't work
+ })
+ .option("keep-classes", {
+ alias: 'C',
+ type: "boolean",
+ desc: __`Preserve all CSS classes for input elements, instead of adapting to Firefox's Reader Mode`
+ })
+ .option("output", {
+ alias: 'o',
+ type: "string",
+ desc: __`The file to which the result should be output`
+ })
+ .option("properties", {
+ alias: 'p',
+ type: "array",
+ desc: __`Output specific properties of the parsed article`,
+ choices: Array.from(Properties.keys())
+ })
+ .option("proxy", {
+ alias: 'x',
+ type: "string",
+ desc: __`Use specified proxy (can also use HTTPS_PROXY environment variable)`,
+ hidden: typeof Deno !== "undefined"
+ })
+ .option("quiet", {
+ alias: 'q',
+ type: "boolean",
+ desc: __`Don't output extra information to stderr`
+ })
+ .option("style", {
+ alias: 's',
+ type: "string",
+ desc: __`Specify .css file for stylesheet`
+ })
+ .option("url", {
+ alias: 'u',
+ type: "string",
+ desc: __`(deprecated) alias for --base`,
+ hidden: true,
+ //deprecated: true //completion script does not respect this value, so just say it in the description
+ })
+ .option("user-agent", {
+ alias: 'A',
+ type: "string",
+ desc: __`Set custom user agent string`
+ })
+ .option("version", {
+ alias: 'V',
+ type: "boolean",
+ desc: __`Print version`
+ })
+ .fail((msg, err, yargs) => {
+ console.error(msg);
+ setErrored(ExitCodes.badUsageCLI);
+ })
+ .epilogue(__`See the manual for more info: man readability-cli`)
+ .wrap(Math.min(yargs.terminalWidth(), 100))
+ .strict()
+ .parse();
+
+ if (args["is-url"]) {
+ console.error(__`Note: --is-url option is deprecated.`);
+ }
+ if (args["url"]) {
+ console.error(__`Note: --url option is deprecated, please use --base instead.`);
+ args["base"] = args["url"];
+ }
+
+
+ //Show usage information by delegating to yargs.showHelp().
+ function printUsage() {
+ yargs.showHelp();
+ }
+
+ if (args["completion"]) {
+ yargs.showCompletionScript();
+ return;
+ }
+
+ if (args["version"]) {
+ printVersion();
+ return;
+ }
+
+
+ //--insecure and --proxy are documented as Node.js-only, so reject them under
+ //Deno. The message reads "Error: ..." so that it matches the key shipped in
+ //locales/*.json (the "Warning: ..." wording had no translation entry), and
+ //because the run is aborted with a usage error.
+ if (typeof Deno !== "undefined") {
+   for (const option of ["insecure", "proxy"]) {
+     if (args[option]) {
+       console.error(__`Error: option --${option} is not supported in Deno.`);
+       setErrored(ExitCodes.badUsageCLI);
+       return;
+     }
+   }
+ }
+
+
+ if (args["keep-classes"]) {
+ readabilityOptions["keepClasses"] = true;
+ }
+
+
+ if (!args["low-confidence"]) {
+ args["low-confidence"] = LowConfidenceMode.keep;
+ args['l'] = LowConfidenceMode.keep;
+ } else if (!Object.values(LowConfidenceMode).includes(args["low-confidence"])) {
+ console.error(__`Unknown mode: ${args["low-confidence"]}\nPlease use one of: keep, force, exit`);
+ console.error(__`Use --help for more info.`);
+ setErrored(ExitCodes.badUsageCLI);
+ return;
+ }
+
+
+ let inputArg;
+ if (!args["source"]) {
+ if (process.stdin.isTTY) {
+ console.error(__`No input provided`);
+ printUsage();
+ setErrored(ExitCodes.badUsageCLI);
+ return;
+ } else {
+ inputArg = '-'
+ }
+ } else {
+ inputArg = args["source"];
+ }
+
+ //Get input parameter, remove inputArg from args
+ let inputFile;
+ let inputURL;
+ let inputIsFromStdin = false;
+
+ if (args["is-url"] && inputArg.search(/^\w+:\/\//) == -1)
+ inputArg = "https://" + inputArg;
+ if (!args["is-file"] && inputArg.search(/^\w+:\/\//) != -1)
+ inputURL = inputArg;
+ else if (inputArg == '-')
+ inputIsFromStdin = true;
+ else
+ inputFile = inputArg;
+
+
+ const outputArg = args["output"];
+ const documentURL = args["base"] || inputURL;
+ const outputJSON = args["json"];
+
+ let proxy = args["proxy"];
+ if (!proxy && typeof Deno === "undefined")
+ proxy = process.env.https_proxy || process.env.HTTPS_PROXY || process.env.http_proxy;
+
+
+ let wantedProperties;
+ if (args["properties"]) {
+ wantedProperties = args["properties"];
+ }
+
+
+ if (errored) {
+ printUsage();
+ return;
+ }
+
+ async function read(stream) {
+ const chunks = [];
+ for await (const chunk of stream){
+ chunks.push(chunk);
+ }
+ return Buffer.concat(chunks).toString("utf8");
+ }
+
+
+
+ let document, window
+ try {
+ if (inputIsFromStdin) {
+ if (!args["quiet"]) {
+ console.error("Reading...");
+ if (!documentURL)
+ console.error(__`Warning: piping input with unknown URL. This means that relative links will be broken. Supply the --base parameter to fix.`)
+ }
+ const input = await read(process.stdin);
+ [document, window] = await parseDOM(result, documentURL);
+ } else {
+ if (!args["quiet"])
+ console.error(__`Retrieving...`);
+
+ let parseDOMPromise;
+ if (inputURL) {
+ parseDOMPromise = parseDOMFromURL(documentURL, proxy, !args["insecure"], args["user-agent"]);
+ } else if (inputFile) {
+ parseDOMPromise = parseDOMFromFile(inputFile, documentURL);
+ }
+ [document, window] = await parseDOMPromise;
+ }
+ } catch (error) {
+ if (error.error) {
+ //Nested error?
+ error = error.error;
+ }
+ if (error instanceof TypeError && inputURL) {
+ console.error(__`Invalid URL: ${inputURL}`);
+ setErrored(ExitCodes.badUsageCLI);
+ } else if (error.code == "ENOENT") {
+ console.error(error.message);
+ setErrored(ExitCodes.noInput);
+ } else if (error.code == "EACCES") {
+ console.error(error.message);
+ setErrored(ExitCodes.noPermission);
+ } else if (error.code == "ENOTFOUND") {
+ console.error(__`Host not found: ${error.hostname}`);
+ setErrored(ExitCodes.noHost);
+ } else if (error.statusCode) {
+ console.error(__`Status error: ${error.response.statusMessage}`);
+ setErrored(ExitCodes.serviceUnavailable);
+ } else {
+ console.error(error.message);
+ //console.error(error);
+ setErrored(ExitCodes.serviceUnavailable);
+ }
+ return;
+ }
+
+
+
+ //Taken from https://stackoverflow.com/a/22706073/5701177
+ function escapeHTML(string, document) {
+ var p = document.createElement("p");
+ p.appendChild(document.createTextNode(string));
+ return p.innerHTML;
+ }
+
+ async function getHTML(document, window) {
+ if (args["insane"])
+ return document.documentElement.outerHTML;
+ else
+ return await sanitizeDOM(document, window);
+ }
+
+ let shouldParseArticle = true;
+ if (args["low-confidence"] != LowConfidenceMode.force)
+ shouldParseArticle = isProbablyReaderable(document);
+
+ if (!shouldParseArticle) {
+ if (args["low-confidence"] == LowConfidenceMode.exit) {
+ console.error(__`Not sure if this document should be processed, exiting`);
+ setErrored(ExitCodes.dataError);
+ return;
+ } else {
+ if (!args["quiet"])
+ console.error(__`Not sure if this document should be processed. Not processing`);
+ if (args["json"] || wantedProperties) {
+ console.error(__`Can't output properties`);
+ setErrored(ExitCodes.dataError);
+ return;
+ }
+ shouldParseArticle = false;
+ }
+ }
+
+ let writeStream;
+ if (outputArg) {
+ writeStream = fs.createWriteStream(outputArg);
+ } else {
+ writeStream = process.stdout;
+ }
+
+
+ if (!shouldParseArticle) {
+ //Ignore wantedProperties, that should've thrown an error before
+ writeStream.write(await getHTML(document, window));
+ return;
+ }
+
+ if (!args["quiet"])
+ console.error(__`Processing...`);
+
+ const reader = new Readability(document, readabilityOptions);
+ const article = reader.parse();
+ if (!article) {
+ if (args["low-confidence"] == LowConfidenceMode.keep) {
+ if (!args["quiet"])
+ console.error(__`Couldn't process document.`);
+ writeStream.write(await getHTML(document, window));
+ } else {
+ console.error(__`Couldn't process document.`);
+ setErrored(ExitCodes.dataError);
+ }
+ return;
+ }
+ if (outputJSON) {
+ let result = {};
+ if (wantedProperties) {
+ for (propertyName of wantedProperties)
+ result[propertyName] = Properties.get(propertyName)(article, false, window);
+ } else {
+ for (const [name, func] of Properties) {
+ result[name] = func(article, false, window);
+ }
+ }
+ writeStream.write(JSON.stringify(result));
+ } else {
+ if (wantedProperties) {
+ for (propertyName of wantedProperties)
+ writeStream.write(Properties.get(propertyName)(article, true, window) + '\n');
+ } else {
+ writeStream.write(`
+
+
+ `);
+ if (args["style"] || !args["keep-classes"]) {
+ const cssHref = args["style"] || "chrome://global/skin/aboutReader.css";
+ writeStream.write(`
+ `);
+ }
+ writeStream.write(`
+ ${escapeHTML(Properties.get("title")(article, false, window), document)}
+
+`
+ );
+
+ if (!args["keep-classes"]) {
+ //Add a few divs and classes so that Firefox Reader Mode CSS works well
+ writeStream.write(`
+
+ `);
+ else
+ writeStream.write('>');
+
+ writeStream.write(`
+
+
+
+
+
+
+`
+ );
+ const html = Properties.get("html-content")(article, false, window);
+ if (!args["insane"])
+ writeStream.write(await sanitizeHTML(html, window));
+ else
+ writeStream.write(html);
+ writeStream.write(`
+
+
+
+`
+ );
+ } else {
+ writeStream.write("\n\n");
+ writeStream.write(Properties.get("html-title")(article, false, window));
+ writeStream.write('\n');
+
+ const author = Properties.get("byline")(article, false, window);
+ if (author) {
+ writeStream.write(`${escapeHTML(author, document)}
`);
+ }
+ writeStream.write("\n \n");
+ const html = Properties.get("html-content")(article, false, window);
+ if (!args["insane"])
+ writeStream.write(await sanitizeHTML(html, window));
+ else
+ writeStream.write(html);
+ }
+
+
+ writeStream.write("\n");
+ }
+ }
+}
diff --git a/index.js b/index.js
index e4726b0..e4ab672 100755
--- a/index.js
+++ b/index.js
@@ -4,7 +4,7 @@
Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library
- Copyright (C) 2021 gardenapple
+ Copyright (C) 2022 gardenapple
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -20,7 +20,12 @@ Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library
along with this program. If not, see .
*/
+const fs = require("fs");
const path = require("path");
+const process = require("process");
+
+const yargs = require("yargs");
+const y18n = require("y18n");
// GNU gettext gives preference to LANGUAGE above all else, but this order is consistent with Yargs:
const locale = (
@@ -31,541 +36,79 @@ const locale = (
"en_US"
).replace(/[.:].*/, '');
-const yargs = require("yargs");
-const __ = require("y18n")({
+const __ = y18n({
locale: locale,
updateFiles: false,
- directory: path.resolve(__dirname, "locales")
+ directory: path.resolve(__dirname, "./locales")
}).__;
-//JSDOM, fs, Readability, and Readability-readerable are loaded on-demand.
-
-const ExitCodes = {
- badUsageCLI: 64,
- dataError: 65,
- noInput: 66,
- noHost: 68,
- serviceUnavailable: 69,
- noPermission: 77
-};
-
-let errored = false;
-
-function setErrored(exitCode) {
- process.exitCode = exitCode;
- errored = true;
-}
-
-
-//
-//Parsing arguments
-//
-
-const Properties = new Map([
- ["html-title", (article, singleLine, window) =>
- `${escapeHTML(Properties.get("title")(article, singleLine, window), window.document)} `
- ],
- ["title", (article, singleLine) =>
- singleLine ? article.title.replace(/\n+/gm, ' ') : article.title
- ],
- ["excerpt", (article, singleLine) =>
- singleLine ? article.excerpt.replace(/\n+/gm, ' ') : article.excerpt
- ],
- ["byline", (article, singleLine) =>
- singleLine ? article.byline.replace(/\n+/gm, ' ') : article.byline
- ],
- ["length", article => article.length],
- ["dir", article => article.dir],
- ["text-content", article => article.textContent],
- ["html-content", (article, _, window) => {
- if (!args["insane"]) {
- const createDOMPurify = require("dompurify");
- const DOMPurify = createDOMPurify(window);
- return DOMPurify.sanitize(article.content);
- }
- return article.content;
- }]
-]);
-
-const LowConfidenceMode = {
- keep: "keep",
- force: "force",
- exit: "exit"
-};
-
-const readabilityOptions = {};
-
-//backwards compat with old, comma-separated values
-function yargsCompatProperties(args) {
- if (args["properties"]) {
- for (var i = 0; i < args["properties"].length; i++) {
- const property = args["properties"][i];
- if (property.indexOf(',') > -1) {
- const split = args["properties"][i].split(',');
- args["properties"].splice(i, 1, ...split);
- continue;
- }
- if (!Properties.has(property)) {
- args["properties"].splice(i, 1);
- i--;
- if (!args["--"])
- args["--"] = [ property ];
- else
- args["--"].push(property);
- }
- }
- }
-}
-
-//Positional sometimes don't get recognized when they're put
-//after other arguments, I think it's an oversight in yargs.
-function yargsFixPositional(args) {
- if (args["--"]) {
- if (!args["source"])
- args["source"] = args["--"].shift();
- args["_"] = args["--"];
- }
-}
-
-
-let args = yargs
- .version(false)
- .command("* [source]", __`Process HTML input`, (yargs) => {
- yargs.positional("source", {
- desc: __`A file, an http(s) URL, or '-' for standard input`,
- type: "string"
- });
- })
- .completion('--completion', false)
- .middleware([ yargsCompatProperties, yargsFixPositional ], true) //middleware seems to be buggy
- .option("help", {
- alias: 'h',
- type: "boolean",
- desc: __`Show help`
- })
- .option("completion", {
- type: "boolean",
- desc: __`Print script for bash/zsh completion`
- })
- .option("base", {
- alias: 'b',
- type: "string",
- desc: __`Set the document URL when parsing standard input or a local file (this affects relative links)`
- })
- .option("insane", {
- alias: 'S',
- type: "boolean",
- desc: __`Don't sanitize HTML`
- })
- .option("insecure", {
- alias: 'K',
- type: "boolean",
- desc: __`Allow invalid SSL certificates`
- })
- .option("is-file", {
- alias: 'f',
- type: "boolean",
- desc: __`Interpret SOURCE as a file name rather than a URL`,
- default: false,
- hidden: true,
- //deprecated: true
- })
- .option("is-url", {
- alias: 'U',
- type: "boolean",
- desc: __`(deprecated) Interpret SOURCE as a URL rather than file name`,
- hidden: true,
- //deprecated: true
- })
- .option("json", {
- alias: 'j',
- type: "boolean",
- desc: __`Output properties as a JSON payload`
- })
- .option("low-confidence", {
- alias: 'l',
- type: "string",
- desc: __`What to do if Readability.js is uncertain about what the core content actually is`,
- choices: ["keep", "force", "exit"]
- //default: "no-op", //don't set default because completion won't work
- })
- .option("keep-classes", {
- alias: 'C',
- type: "boolean",
- desc: __`Preserve all CSS classes for input elements, instead of adapting to Firefox's Reader Mode`
- })
- .option("output", {
- alias: 'o',
- type: "string",
- desc: __`The file to which the result should be output`
- })
- .option("properties", {
- alias: 'p',
- type: "array",
- desc: __`Output specific properties of the parsed article`,
- choices: Array.from(Properties.keys())
- })
- .option("proxy", {
- alias: 'x',
- type: "string",
- desc: __`Use specified proxy (can also use HTTPS_PROXY environment variable)`
- })
- .option("quiet", {
- alias: 'q',
- type: "boolean",
- desc: __`Don't output extra information to stderr`
- })
- .option("style", {
- alias: 's',
- type: "string",
- desc: __`Specify .css file for stylesheet`
- })
- .option("url", {
- alias: 'u',
- type: "string",
- desc: __`(deprecated) alias for --base`,
- hidden: true,
- //deprecated: true //completion script does not respect this value, so just say it in the description
- })
- .option("user-agent", {
- alias: 'A',
- type: "string",
- desc: __`Set custom user agent string`
- })
- .option("version", {
- alias: 'V',
- type: "boolean",
- desc: __`Print version`
- })
- .epilogue(__`See the manual for more info: man readability-cli`)
- .wrap(Math.min(yargs.terminalWidth(), 100))
- .strict()
- .parse();
-
-if (args["is-url"]) {
- console.error(__`Note: --is-url option is deprecated.`);
-}
-if (args["url"]) {
- console.error(__`Note: --url option is deprecated, please use --base instead.`);
- args["base"] = args["url"];
-}
-
-
-function printUsage() {
- yargs.showHelp();
-}
-
-if (args["completion"]) {
- yargs.showCompletionScript();
- return;
-}
-
+const { Readability, isProbablyReaderable } = require("@mozilla/readability");
-if (args["version"]) {
+function printVersion() {
console.log(`readability-cli v${require("./package.json").version}`);
console.log(`Node.js ${process.version}`);
- return;
-}
-
-
-
-if (args["keep-classes"]) {
- readabilityOptions["keepClasses"] = true;
-}
-
-
-if (!args["low-confidence"]) {
- args["low-confidence"] = LowConfidenceMode.keep;
- args['l'] = LowConfidenceMode.keep;
-} else if (!Object.values(LowConfidenceMode).includes(args["low-confidence"])) {
- console.error(__`Unknown mode: ${args["low-confidence"]}\nPlease use one of: keep, force, exit`);
- console.error(__`Use --help for more info.`);
- setErrored(ExitCodes.badUsageCLI);
- return;
}
+async function parseDOMFromURL(url, proxy, strictSSL, userAgent) {
+ const { JSDOM, ResourceLoader } = require("jsdom");
+ const resourceLoader = new ResourceLoader({
+ proxy: proxy,
+ strictSSL: strictSSL,
+ userAgent: userAgent
+ });
-let inputArg;
-if (!args["source"]) {
- if (process.stdin.isTTY) {
- console.error(__`No input provided`);
- printUsage();
- setErrored(ExitCodes.badUsageCLI);
- return;
- } else {
- inputArg = '-'
- }
-} else {
- inputArg = args["source"];
+ const dom = await JSDOM.fromURL(url, {
+ resources: resourceLoader
+ })
+ return [dom.window.document, dom.window];
}
-//Get input parameter, remove inputArg from args
-let inputFile;
-let inputURL;
-let inputIsFromStdin = false;
-
-if (args["is-url"] && inputArg.search(/^\w+:\/\//) == -1)
- inputArg = "https://" + inputArg;
-if (!args["is-file"] && inputArg.search(/^\w+:\/\//) != -1)
- inputURL = inputArg;
-else if (inputArg == '-')
- inputIsFromStdin = true;
-else
- inputFile = inputArg;
-
-
-const outputArg = args["output"];
-const documentURL = args["base"] || inputURL;
-const outputJSON = args["json"];
-
-const proxy = args["proxy"] || process.env.https_proxy || process.env.HTTPS_PROXY || process.env.http_proxy;
-
-
-let wantedProperties;
-if (args["properties"]) {
- wantedProperties = args["properties"];
+async function parseDOM(html, url) {
+ const { JSDOM } = require("jsdom");
+ const dom = new JSDOM(html, { url: url });
+ return [dom.window.document, dom.window];
}
-
-if (errored) {
- printUsage();
- return;
+async function parseDOMFromFile(file, url) {
+ const { JSDOM } = require("jsdom");
+ const dom = await JSDOM.fromFile(file, {
+ url: url,
+ // workaround for https://gitlab.com/gardenappl/readability-cli/-/issues/9
+ contentType: "text/html; charset=utf-8"
+ })
+ return [dom.window.document, dom.window];
}
-async function read(stream) {
- const chunks = [];
- for await (const chunk of stream){
- chunks.push(chunk);
- }
- return Buffer.concat(chunks).toString("utf8");
+async function sanitizeHTML(html, window) {
+ const createDOMPurify = require("dompurify");
+ const DOMPurify = createDOMPurify(window);
+ return DOMPurify.sanitize(html);
}
-
-
-if (inputIsFromStdin) {
- if (!args["quiet"]) {
- console.error("Reading...");
- if (!documentURL)
- console.error(__`Warning: piping input with unknown URL. This means that relative links will be broken. Supply the --base parameter to fix.`)
- }
- read(process.stdin).then(result => {
- const JSDOM = require("jsdom").JSDOM;
- onLoadDOM(new JSDOM(result, { url: documentURL }));
- });
-} else {
- if (!args["quiet"])
- console.error(__`Retrieving...`);
- const jsdom = require("jsdom");
-
- let promiseGetHTML;
- if (inputURL) {
- const resourceLoader = new jsdom.ResourceLoader({
- proxy: proxy,
- strictSSL: !args["insecure"],
- userAgent: args["user-agent"]
- });
- promiseGetHTML = jsdom.JSDOM.fromURL(inputURL, {
- resources: resourceLoader
- });
- } else if (inputFile) {
- promiseGetHTML = jsdom.JSDOM.fromFile(inputFile, {
- url: documentURL,
- // workaround for https://gitlab.com/gardenappl/readability-cli/-/issues/9
- contentType: "text/html; charset=utf-8"
- });
- }
-
- promiseGetHTML.then(onLoadDOM, onLoadDOMError);
+async function sanitizeDOM(document, window) {
+ const createDOMPurify = require("dompurify");
+ const DOMPurify = createDOMPurify(window);
+ DOMPurify.sanitize(document, {IN_PLACE: true, WHOLE_DOCUMENT: true});
+ return document.documentElement.outerHTML;
}
-
-const { Readability, isProbablyReaderable } = require("@mozilla/readability");
-
-//Taken from https://stackoverflow.com/a/22706073/5701177
-function escapeHTML(string, document) {
- var p = document.createElement("p");
- p.appendChild(document.createTextNode(string));
- return p.innerHTML;
-}
-
-function onLoadDOM(dom) {
- const window = dom.window
-
- let shouldParseArticle = true;
- if (args["low-confidence"] != LowConfidenceMode.force)
- shouldParseArticle = isProbablyReaderable(window.document);
-
- if (!shouldParseArticle) {
- if (args["low-confidence"] == LowConfidenceMode.exit) {
- console.error(__`Not sure if this document should be processed, exiting`);
- setErrored(ExitCodes.dataError);
- return;
- } else {
- if (!args["quiet"])
- console.error(__`Not sure if this document should be processed. Not processing`);
- if (args["json"] || wantedProperties) {
- console.error(__`Can't output properties`);
- setErrored(ExitCodes.dataError);
- return;
- }
- shouldParseArticle = false;
- }
- }
-
- let writeStream;
- if (outputArg) {
- const fs = require("fs");
- writeStream = fs.createWriteStream(outputArg);
- } else {
- writeStream = process.stdout;
- }
-
-
- if (!shouldParseArticle) {
- //Ignore wantedProperties, that should've thrown an error before
- writeStream.write(getHTML(window));
- return;
- }
-
- if (!args["quiet"])
- console.error(__`Processing...`);
-
- const reader = new Readability(window.document, readabilityOptions);
- const article = reader.parse();
- if (!article) {
- if (args["low-confidence"] == LowConfidenceMode.keep) {
- if (!args["quiet"])
- console.error(__`Couldn't process document.`);
- writeStream.write(getHTML(window));
- } else {
- console.error(__`Couldn't process document.`);
- setErrored(ExitCodes.dataError);
- }
- return;
- }
- if (outputJSON) {
- let result = {};
- if (wantedProperties) {
- for (propertyName of wantedProperties)
- result[propertyName] = Properties.get(propertyName)(article, false, window);
- } else {
- for (const [name, func] of Properties) {
- result[name] = func(article, false, window);
- }
- }
- writeStream.write(JSON.stringify(result));
- } else {
- if (wantedProperties) {
- for (propertyName of wantedProperties)
- writeStream.write(Properties.get(propertyName)(article, true, window) + '\n');
- } else {
- writeStream.write(`
-
-
- `);
- if (args["style"] || !args["keep-classes"]) {
- const cssHref = args["style"] || "chrome://global/skin/aboutReader.css";
- writeStream.write(`
- `);
- }
- writeStream.write(`
- ${escapeHTML(Properties.get("title")(article, false, window), window.document)}
-
-`
- );
-
- if (!args["keep-classes"]) {
- //Add a few divs and classes so that Firefox Reader Mode CSS works well
- writeStream.write(`
-
- `);
- else
- writeStream.write('>');
-
- writeStream.write(`
-
-
-
-
-
-
-`
- );
- writeStream.write(Properties.get("html-content")(article, false, window));
- writeStream.write(`
-
-
-
-`
- );
- } else {
- writeStream.write("\n\n");
- writeStream.write(Properties.get("html-title")(article, false, window));
- writeStream.write('\n');
-
- const author = Properties.get("byline")(article, false, window);
- if (author) {
- writeStream.write(`${escapeHTML(author, window.document)}
`);
- }
- writeStream.write("\n \n");
- writeStream.write(Properties.get("html-content")(article, false, window));
- }
-
-
- writeStream.write("\n");
- }
- }
-}
-
-function onLoadDOMError(error) {
- if (error.error) {
- //Nested error?
- error = error.error;
- }
- if (error instanceof TypeError && inputURL) {
- console.error(__`Invalid URL: ${inputURL}`);
- setErrored(ExitCodes.badUsageCLI);
- } else if (error.code == "ENOENT") {
- console.error(error.message);
- setErrored(ExitCodes.noInput);
- } else if (error.code == "EACCES") {
- console.error(error.message);
- setErrored(ExitCodes.noPermission);
- } else if (error.code == "ENOTFOUND") {
- console.error(__`Host not found: ${error.hostname}`);
- setErrored(ExitCodes.noHost);
- } else if (error.statusCode) {
- console.error(__`Status error: ${error.response.statusMessage}`);
- setErrored(ExitCodes.serviceUnavailable);
- } else {
- console.error(error.message);
- //console.error(error);
- setErrored(ExitCodes.serviceUnavailable);
- }
-}
-
-function getHTML(window) {
- let html = window.document.documentElement.outerHTML;
- if (!args["insane"]) {
- const createDOMPurify = require("dompurify");
- const DOMPurify = createDOMPurify(window);
- return DOMPurify.sanitize(html, {IN_PLACE: true, WHOLE_DOCUMENT: true});
- }
- return html;
-}
+import("./common.mjs").then((module) => {
+ const readable = module.default;
+ readable(
+ Buffer,
+ fs,
+ process,
+ yargs,
+ __,
+ Readability,
+ isProbablyReaderable,
+ printVersion,
+ parseDOM,
+ parseDOMFromFile,
+ parseDOMFromURL,
+ sanitizeDOM,
+ sanitizeHTML
+ );
+});
diff --git a/locales/en.json b/locales/en.json
index fe0f3af..a6edaf9 100644
--- a/locales/en.json
+++ b/locales/en.json
@@ -34,5 +34,6 @@
"Host not found: '%s'": "Host not found: '%s'",
"Unknown mode: %s\nPlease use one of: keep, force, exit": "Unknown mode: %s\nPlease use one of: keep, force, exit",
"Use --help for more info.": "Use --help for more info.",
- "See the manual for more info: man readability-cli": "See the manual for more info: man readability-cli"
+ "See the manual for more info: man readability-cli": "See the manual for more info: man readability-cli",
+ "Error: option --%s is not supported in Deno.": "Error: option --%s is not supported in Deno."
}
diff --git a/locales/ru.json b/locales/ru.json
index a752137..d53c583 100644
--- a/locales/ru.json
+++ b/locales/ru.json
@@ -34,5 +34,6 @@
"Host not found: '%s'": "Сервер не найден: '%s'",
"Unknown mode: %s\nPlease use one of: keep, force, exit": "Неизвестный режим: %s\nПожалуйста, используйте один из: keep, force, exit",
"Use --help for more info.": "Чтобы узнать больше, воспользуйтесь --help",
- "See the manual for more info: man readability-cli": "Чтобы узнать больше, загляните в мануал: man readability-cli"
+ "See the manual for more info: man readability-cli": "Чтобы узнать больше, загляните в мануал: man readability-cli",
+ "Error: option --%s is not supported in Deno.": "Ошибка: параметр --%s не поддерживается в Deno."
}
diff --git a/package-lock.json b/package-lock.json
index a54b460..2fdbed5 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -13,7 +13,7 @@
"dompurify": "^2.3.4",
"jsdom": "^19.0.0",
"y18n": "^5.0.8",
- "yargs": "^17.3.0"
+ "yargs": "^17.4.0"
},
"bin": {
"readable": "index.js"
@@ -776,9 +776,9 @@
}
},
"node_modules/yargs": {
- "version": "17.3.0",
- "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.3.0.tgz",
- "integrity": "sha512-GQl1pWyDoGptFPJx9b9L6kmR33TGusZvXIZUT+BOz9f7X2L94oeAskFYLEg/FkhV06zZPBYLvLZRWeYId29lew==",
+ "version": "17.4.0",
+ "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.4.0.tgz",
+ "integrity": "sha512-WJudfrk81yWFSOkZYpAZx4Nt7V4xp7S/uJkX0CnxovMCt1wCE8LNftPpNuF9X/u9gN5nsD7ycYtRcDf2pL3UiA==",
"dependencies": {
"cliui": "^7.0.2",
"escalade": "^3.1.1",
@@ -1344,9 +1344,9 @@
"integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA=="
},
"yargs": {
- "version": "17.3.0",
- "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.3.0.tgz",
- "integrity": "sha512-GQl1pWyDoGptFPJx9b9L6kmR33TGusZvXIZUT+BOz9f7X2L94oeAskFYLEg/FkhV06zZPBYLvLZRWeYId29lew==",
+ "version": "17.4.0",
+ "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.4.0.tgz",
+ "integrity": "sha512-WJudfrk81yWFSOkZYpAZx4Nt7V4xp7S/uJkX0CnxovMCt1wCE8LNftPpNuF9X/u9gN5nsD7ycYtRcDf2pL3UiA==",
"requires": {
"cliui": "^7.0.2",
"escalade": "^3.1.1",
diff --git a/package.json b/package.json
index 7caece1..15e0f7c 100644
--- a/package.json
+++ b/package.json
@@ -32,7 +32,7 @@
"dompurify": "^2.3.4",
"jsdom": "^19.0.0",
"y18n": "^5.0.8",
- "yargs": "^17.3.0"
+ "yargs": "^17.4.0"
},
"devDependencies": {
"marked-man": "^0.7.0"
diff --git a/readability-cli.1.md b/readability-cli.1.md
index e8ae3fd..45b07ae 100644
--- a/readability-cli.1.md
+++ b/readability-cli.1.md
@@ -28,7 +28,7 @@ The *SOURCE* can be a URL, a file, or '-' for standard input.
`-K`, `--insecure`
-* Allow invalid SSL certificates.
+* (Node.js version only) Allow invalid SSL certificates.
`-j`, `--json`
@@ -58,7 +58,7 @@ The *SOURCE* can be a URL, a file, or '-' for standard input.
`-x`, `--proxy` *URL*
-* Use specified proxy (can also use `HTTPS_PROXY` environment variable).
+* (Node.js version only) Use the specified proxy. Both the Node.js and Deno versions can also use the `HTTPS_PROXY` environment variable.
`-q`, `--quiet`
@@ -114,7 +114,7 @@ As usual, exit code 0 indicates success, and anything other than 0 is an error.
**readability-cli** supports localization, using the environment variables `LC_ALL`, `LC_MESSAGES`, `LANG` and `LANGUAGE`, in that order. Only one language at a time is supported.
-`HTTPS_PROXY` will set the HTTPS proxy, as previously stated, however the `--proxy` option overrides this. Lowercase `https_proxy` and `http_proxy` are also recognized.
+`HTTPS_PROXY` will set the HTTPS proxy, as previously stated; however, the `--proxy` option overrides this. Node.js also recognizes lowercase `https_proxy` and `http_proxy`, for compatibility with `curl`.
## EXAMPLE
diff --git a/readable.ts b/readable.ts
new file mode 100644
index 0000000..a7a7859
--- /dev/null
+++ b/readable.ts
@@ -0,0 +1,131 @@
+#!/usr/bin/env -S deno run --allow-net --allow-read --allow-write --allow-env=HTTPS_PROXY,LC_ALL,LC_MESSAGES,LANG,LANGUAGE --no-prompt --no-check
+
+const version = "2.3.5"
+
+import * as path from "https://deno.land/std@0.130.0/path/mod.ts"
+
+import { Buffer } from "https://deno.land/std@0.130.0/node/buffer.ts"
+import fs from "https://deno.land/std@0.130.0/node/fs.ts"
+import process from "https://deno.land/std@0.130.0/node/process.ts"
+
+import yargs from "https://deno.land/x/yargs@v17.4.0-deno/deno.ts"
+import y18n from "https://deno.land/x/y18n@v5.0.8-deno/deno.ts"
+
+import { initParser, DOMParser, DOMParserMimeType, Document, Element } from "https://deno.land/x/deno_dom@v0.1.21-alpha/deno-dom-wasm-noinit.ts"
+import * as ammonia from "https://deno.land/x/ammonia@0.3.1/mod.ts"
+
+import { Readability, isProbablyReaderable } from "https://esm.sh/@mozilla/readability@0.4.1?no-check"
+import UserAgent from "https://esm.sh/user-agents@1.0.963"
+
+// Locale used for translated messages. GNU gettext would give LANGUAGE the
+// highest priority, but the order below is consistent with Yargs. Unset or
+// empty variables are skipped, and "en_US" is used when none has a value.
+let rawLocale = "en_US"
+for (const variableName of ["LC_ALL", "LC_MESSAGES", "LANG", "LANGUAGE"]) {
+    const value = Deno.env.get(variableName)
+    if (value) {
+        rawLocale = value
+        break
+    }
+}
+// Drop everything from the first '.' or ':' onwards.
+const locale = rawLocale.replace(/[.:].*/, '')
+
+// Translation function. Message catalogs are looked up in the "locales"
+// directory that sits next to this script; updateFiles is switched off.
+const scriptDirectory = path.dirname(path.fromFileUrl(import.meta.url))
+const __ = y18n({
+    locale,
+    updateFiles: false,
+    directory: path.join(scriptDirectory, "locales")
+}).__
+
+
+/** Logs the readability-cli version followed by the Deno version, one per line. */
+function printVersion() {
+    const versionLines = [
+        `readability-cli v${version}`,
+        `Deno ${Deno.version.deno}`
+    ]
+    for (const line of versionLines) {
+        console.log(line)
+    }
+}
+
+/**
+ * Downloads `url` and parses the response body into a DOM.
+ *
+ * @param url Address of the page to fetch; also passed on to `parseDOM`.
+ * @param proxy Not used by this implementation.
+ * @param strictSSL Not used by this implementation.
+ * @param userAgent Value for the User-Agent request header. When omitted, a
+ *   desktop user agent string is generated.
+ * @returns Whatever `parseDOM` returns for the downloaded text.
+ * @throws A plain object carrying `statusCode` and `response.statusMessage`
+ *   when the response status is not OK.
+ */
+async function parseDOMFromURL(url: string, proxy?: string, strictSSL?: boolean, userAgent?: string) {
+    // Kick off parser initialization before the request; it is awaited below.
+    const initParserPromise = initParser()
+
+    const userAgentString = userAgent ?? new UserAgent({ deviceCategory: "desktop" }).toString()
+    const response = await fetch(url, {
+        headers: {
+            "User-Agent": userAgentString
+        }
+    })
+    if (!response.ok) {
+        throw {
+            statusCode: response.status,
+            response: {
+                statusMessage: response.statusText
+            }
+        }
+    }
+    const text = await response.text()
+    await initParserPromise
+
+    // The Content-Type header may be missing, and may or may not carry
+    // parameters such as "; charset=utf-8". Keep only the media type itself;
+    // when there is none, parseDOM falls back to its own default.
+    const contentType = response.headers.get("Content-Type")
+    const mimeType = contentType?.split(';', 1)[0]?.trim().toLowerCase() || undefined
+    return parseDOM(text, url, mimeType as DOMParserMimeType | undefined)
+}
+
+/**
+ * Builds a URL from `input`, optionally resolved against `base`.
+ * Returns undefined instead of throwing when `input` is empty or cannot be parsed.
+ */
+function tryParseURL(input: string | null | undefined, base?: string | URL): URL | undefined {
+    if (!input) {
+        return undefined
+    }
+    try {
+        return new URL(input, base)
+    } catch {
+        return undefined
+    }
+}
+
+/**
+ * Parses markup into a document and rewrites relative `href` attributes as
+ * absolute URLs.
+ *
+ * The base for resolving links is the `href` of the first `<base>` element,
+ * itself resolved against `url` when it is relative. If there is no such
+ * element, or its `href` is empty, `url` is used. When no usable base can be
+ * determined, the links are left untouched.
+ *
+ * @param html Markup to parse.
+ * @param url Address the markup came from, if known.
+ * @param mimeType Type handed to the parser; defaults to "text/html".
+ * @returns A one-element array containing the parsed document.
+ */
+async function parseDOM(html: string, url?: string, mimeType?: DOMParserMimeType) {
+    await initParser()
+    const document = new DOMParser().parseFromString(html, mimeType ?? "text/html")!
+
+    // `||` rather than `??`: an empty <base href=""> should fall back to `url` too.
+    const baseURLString = document.getElementsByTagName("base")[0]?.getAttribute("href") || url
+    const baseURL = tryParseURL(baseURLString, url)
+    if (baseURL) {
+        // Walk every element, using an explicit stack instead of recursion.
+        const nodes: Element[] = [document.documentElement!]
+
+        while (nodes.length > 0) {
+            const element = nodes.pop()!
+            const href = element.getAttribute("href")
+            // An href that already parses as an absolute URL is kept as written.
+            if (href && !tryParseURL(href)) {
+                // Assume href is a relative URL; skip it if it cannot be resolved.
+                const absoluteURL = tryParseURL(href, baseURL)
+                if (absoluteURL) {
+                    element.setAttribute("href", absoluteURL.href)
+                }
+            }
+
+            nodes.push(...element.children)
+        }
+    }
+    return [document]
+}
+
+/**
+ * Reads `file`, decodes its bytes with a default `TextDecoder`, and passes
+ * the resulting text to `parseDOM` together with `url`.
+ */
+async function parseDOMFromFile(file: string, url: string) {
+    const bytes = await Deno.readFile(file)
+    const decoder = new TextDecoder()
+    const html = decoder.decode(bytes)
+    return parseDOM(html, url)
+}
+
+/** Awaits `ammonia.init()` and then returns the result of `ammonia.clean(html)`. */
+async function sanitizeHTML(html: string) {
+    await ammonia.init()
+    const cleaned = ammonia.clean(html)
+    return cleaned
+}
+
+/** Serializes the document's root element and runs the markup through `sanitizeHTML`. */
+async function sanitizeDOM(document: Document) {
+    const rootElement = document.documentElement!
+    const markup = rootElement.outerHTML
+    return sanitizeHTML(markup)
+}
+
+
+import readable from "./common.mjs"
+// Run the shared CLI implementation with the Deno-specific pieces defined in
+// this file. The arguments are positional and follow the parameter order of
+// the default export of common.mjs.
+await readable(
+ Buffer,
+ fs,
+ process,
+ yargs(Deno.args),
+ __,
+ Readability,
+ isProbablyReaderable,
+ printVersion,
+ parseDOM,
+ parseDOMFromFile,
+ parseDOMFromURL,
+ sanitizeDOM,
+ sanitizeHTML
+)
+
+// common.mjs records failures by assigning process.exitCode. Exit explicitly
+// when a non-zero code was recorded.
+// NOTE(review): presumably process.exit() without an argument exits with
+// process.exitCode in this version of the std node shim — confirm.
+if (process.exitCode) {
+ process.exit()
+}