diff --git a/index.js b/index.js index 2335e5d..edf367b 100644 --- a/index.js +++ b/index.js @@ -1,26 +1,28 @@ #!/usr/bin/env node /* - Firefox Reader Mode in your terminal! - CLI tool for Mozilla's Readability library - Copyright (C) 2020 gardenapple - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. + +Firefox Reader Mode in your terminal! CLI tool for Mozilla's Readability library + + Copyright (C) 2020 gardenapple - You should have received a copy of the GNU General Public License - along with this program. If not, see . + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ -const Readability = require("readability"); const JSDOM = require("jsdom").JSDOM; const parseArgs = require("minimist"); - +//fs, Readability, and Readability-readerable are loaded on-demand. +//To-do: lazy loading? const ExitCodes = { badUsageCLI: 64, @@ -49,9 +51,19 @@ Options: -o --output OUTPUT_FILE Output to OUTPUT_FILE -p --properties PROPS... Output specific properties of the parsed article -V --version Print version - -u --url Set the document URL when parsing standard input or a local file (this affects relative links and such) + -u --url Set the document URL when parsing standard input or a local file (this affects relative links) -U --is-url Interpret SOURCE as a URL rather than file name -q --quiet Don't output extra information to stderr + -l --low-confidence MODE What to do if Readability.js is uncertain about what the core content actually is + + +The --low-confidence option determines what should be done for documents where readability-cli can't determine what the core content is. + no-op When unsure, don't touch the HTML, output as-is. If the --properties option is used, this will make the program crash. + force Filter the HTML even when unsure (may produce really bad output). + exit When unsure, exit with an error. + +Default value is "no-op". + The --properties option accepts a comma-separated list of values (with no spaces in-between). Suitable values are: html-title Outputs the article's title, wrapped in an

tag. @@ -69,7 +81,7 @@ Default value is "html-title,html-content".`); -const stringArgParams = ['_', "--", "output", "properties", "url"]; +const stringArgParams = ['_', "--", "low-confidence", "output", "properties", "url"]; const boolArgParams = ["quiet", "help", "version", "is-url"]; const alias = { "output": 'o', @@ -77,14 +89,15 @@ const alias = { "version": 'V', "url": 'u', "is-url": 'U', - "quiet": 'q' + "quiet": 'q', + "low-confidence": 'l' } let args = parseArgs(process.argv.slice(2), { string: stringArgParams, boolean: boolArgParams, default: { - "properties": "html-title,html-content", + "low-confidence": "no-op", "quiet": false }, alias: alias, @@ -112,7 +125,7 @@ if (errored) { return; } -if (args.help) { +if (args["help"]) { printUsage(); return; } else if (args.version) { @@ -163,6 +176,7 @@ const outputArg = args['output']; const documentURL = args["url"] || inputURL; + const Properties = { htmlTitle: "html-title", title: "title", @@ -172,11 +186,12 @@ const Properties = { dir: "dir", htmlContent: "html-content", textContent: "text-content" -} +}; let wantedProperties = []; +let justOutputHtml = false; -if (args.properties) { - for (var property of args.properties.split(',')) { +if (args["properties"]) { + for (var property of args["properties"].split(',')) { if (Object.values(Properties).includes(property)) { wantedProperties.push(property); } else { @@ -184,10 +199,28 @@ if (args.properties) { setErrored(ExitCodes.badUsageCLI); } } - if (errored) { - printUsage(); - return; - } +} else { + wantedProperties = [ Properties.htmlTitle, Properties.htmlContent ]; + justOutputHtml = true; +} + + + +const LowConfidenceMode = { + noOp: "no-op", + force: "force", + exit: "exit" +}; +if (!Object.values(LowConfidenceMode).includes(args["low-confidence"])) { + console.error(`Invalid mode: ${args["low-confidence"]}`); + setErrored(ExitCodes.badUsageCLI); +} + + + +if (errored) { + printUsage(); + return; } async function read(stream) { @@ -240,14 +273,30 @@ function escapeHTML(string, document){ function onLoadDOM(dom) { const document = dom.window.document; - if (!args["quiet"]) - console.error("Parsing..."); - let reader = new Readability(document); - let article = reader.parse(); - if (!article) { - console.error("Couldn't parse document"); - setErrored(ExitCodes.dataError); - return; + + let shouldParseArticle = true; + + if (args["low-confidence"] != LowConfidenceMode.force) { + const Readerable = require("readability/Readability-readerable"); + + shouldParseArticle = Readerable.isProbablyReaderable(document); + } + + if (!shouldParseArticle) { + if (args["low-confidence"] == LowConfidenceMode.exit) { + console.error("Not sure if this document should be parsed, exiting"); + setErrored(ExitCodes.dataError); + return; + } else { + if (!args["quiet"]) + console.error("Not sure if this document should be parsed. Not parsing"); + if (!justOutputHtml) { + console.error("Can't output properties"); + setErrored(ExitCodes.dataError); + return; + } + shouldParseArticle = false; + } } let writeStream; @@ -258,28 +307,47 @@ function onLoadDOM(dom) { writeStream = process.stdout; } - if (wantedProperties.includes(Properties.title)) { - writeStream.write(`Title: ${article.title}\n`); - } - if (wantedProperties.includes(Properties.excerpt)) { - writeStream.write(`Excerpt: ${article.excerpt}\n`); - } - if (wantedProperties.includes(Properties.byline)) { - writeStream.write(`Author: ${article.byline}\n`); - } - if (wantedProperties.includes(Properties.length)) { - writeStream.write(`Length: ${article.length}\n`); - } - if (wantedProperties.includes(Properties.dir)) { - writeStream.write(`Direction: ${article.dir}\n`); - } - if (wantedProperties.includes(Properties.htmlTitle)) { - writeStream.write(`

${escapeHTML(article.title, document)}

\n`); - } - if (wantedProperties.includes(Properties.htmlContent)) { - writeStream.write(article.content); - } else if (wantedProperties.includes(Properties.textContent)) { - writeStream.write(article.textContent); + + if (shouldParseArticle) { + const Readability = require("readability"); + + if (!args["quiet"]) + console.error("Parsing..."); + + const reader = new Readability(document); + const article = reader.parse(); + if (!article) { + console.error("Couldn't parse document. This error usually means that the input document is empty."); + setErrored(ExitCodes.dataError); + return; + } + + if (wantedProperties.includes(Properties.title)) { + writeStream.write(`Title: ${article.title}\n`); + } + if (wantedProperties.includes(Properties.excerpt)) { + writeStream.write(`Excerpt: ${article.excerpt}\n`); + } + if (wantedProperties.includes(Properties.byline)) { + writeStream.write(`Author: ${article.byline}\n`); + } + if (wantedProperties.includes(Properties.length)) { + writeStream.write(`Length: ${article.length}\n`); + } + if (wantedProperties.includes(Properties.dir)) { + writeStream.write(`Direction: ${article.dir}\n`); + } + if (wantedProperties.includes(Properties.htmlTitle)) { + writeStream.write(`

${escapeHTML(article.title, document)}

\n`); + } + if (wantedProperties.includes(Properties.htmlContent)) { + writeStream.write(article.content); + } else if (wantedProperties.includes(Properties.textContent)) { + writeStream.write(article.textContent); + } + } else { + //Ignore wantedProperties, that should've thrown an error before + writeStream.write(document.documentElement.outerHTML); } } diff --git a/package.json b/package.json index d856391..7f8471c 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@gardenapple/readability-cli", - "version": "1.0.3", + "version": "1.1.0", "description": "Firefox Reader Mode in your terminal - get useful text from a web page using Mozilla's Readability library", "main": "index.js", "bin": {