From a91521846c3ea42b64d861a4d61982e7b1658173 Mon Sep 17 00:00:00 2001 From: gardenapple Date: Tue, 26 Jan 2021 23:14:31 +0200 Subject: [PATCH] Sanitize HTML by default --- index.js | 52 +++++++++++++++++++++++++++++++---------------- locales/en.json | 1 + locales/ru.json | 1 + package-lock.json | 7 ++++++- package.json | 5 +++-- 5 files changed, 46 insertions(+), 20 deletions(-) diff --git a/index.js b/index.js index 74fc312..f89081e 100755 --- a/index.js +++ b/index.js @@ -28,14 +28,14 @@ const locale = ( process.env.LC_MESSAGES || process.env.LANG || process.env.LANGUAGE || - 'en_US' + "en_US" ).replace(/[.:].*/, ''); const yargs = require("yargs"); const __ = require("y18n")({ locale: locale, updateFiles: false, - directory: path.resolve(__dirname, 'locales') + directory: path.resolve(__dirname, "locales") }).__; //JSDOM, fs, Readability, and Readability-readerable are loaded on-demand. @@ -62,8 +62,8 @@ function setErrored(exitCode) { // const Properties = new Map([ - ["html-title", (article, singleLine, document) => - `

${escapeHTML(Properties.get('title')(article, singleLine, document), document)}

` + ["html-title", (article, singleLine, window) => + `

${escapeHTML(Properties.get("title")(article, singleLine, window), window.document)}

` ], ["title", (article, singleLine) => singleLine ? article.title.replace(/\n+/gm, ' ') : article.title @@ -76,8 +76,15 @@ const Properties = new Map([ ], ["length", article => article.length], ["dir", article => article.dir], - ["html-content", article => article.content], - ["text-content", article => article.textContent] + ["text-content", article => article.textContent], + ["html-content", (article, _, window) => { + if (!args["insane"]) { + const createDOMPurify = require("dompurify"); + const DOMPurify = createDOMPurify(window); + return DOMPurify.sanitize(article.content); + } + return article.content; + }] ]); const LowConfidenceMode = { @@ -219,6 +226,11 @@ let args = yargs hidden: true, //deprecated: true }) + .option("insane", { + alias: 'S', + type: "boolean", + desc: __`Don't sanitize HTML` + }) .option("json", { alias: 'j', type: "boolean", @@ -309,9 +321,9 @@ else inputFile = inputArg; -const outputArg = args['output']; +const outputArg = args["output"]; const documentURL = args["base"] || inputURL; -const outputJSON = args['json']; +const outputJSON = args["json"]; let wantedProperties = []; @@ -321,7 +333,7 @@ if (args["properties"]) { wantedProperties = args["properties"]; wantedPropertiesCustom = true; } else { - wantedProperties = [ 'html-title', 'html-content' ]; + wantedProperties = [ "html-title", "html-content" ]; } @@ -335,13 +347,11 @@ async function read(stream) { for await (const chunk of stream){ chunks.push(chunk); } - return Buffer.concat(chunks).toString('utf8'); + return Buffer.concat(chunks).toString("utf8"); } -const JSDOM = require("jsdom").JSDOM; - if (inputIsFromStdin) { if (!args["quiet"]) { console.error("Reading..."); @@ -349,11 +359,13 @@ if (inputIsFromStdin) { console.error(__`Warning: piping input with unknown URL. This means that relative links will be broken. Supply the --base parameter to fix.`) } read(process.stdin).then(result => { + const JSDOM = require("jsdom").JSDOM; onLoadDOM(new JSDOM(result, { url: documentURL })); }); } else { if (!args["quiet"]) console.error(__`Retrieving...`); + const JSDOM = require("jsdom").JSDOM; let promiseGetHTML; if (inputURL) { promiseGetHTML = JSDOM.fromURL(inputURL) @@ -379,7 +391,8 @@ function escapeHTML(string, document) { } function onLoadDOM(dom) { - const document = dom.window.document; + const window = dom.window + const document = window.document; let shouldParseArticle = true; if (args["low-confidence"] != LowConfidenceMode.force) @@ -413,7 +426,12 @@ function onLoadDOM(dom) { if (!shouldParseArticle) { //Ignore wantedProperties, that should've thrown an error before - writeStream.write(document.documentElement.outerHTML); + const createDOMPurify = require("dompurify"); + const DOMPurify = createDOMPurify(window); + let outputHTML = document.documentElement.outerHTML; + if (!args["insane"]) + outputHTML = DOMPurify.sanitize(outputHTML, {WHOLE_DOCUMENT: true}); + writeStream.write(outputHTML); return; } @@ -431,16 +449,16 @@ function onLoadDOM(dom) { let result = {}; if (wantedPropertiesCustom) { for (propertyName of wantedProperties) - result[propertyName] = Properties.get(propertyName)(article, false, document); + result[propertyName] = Properties.get(propertyName)(article, false, window); } else { for (const [name, func] of Properties) { - result[name] = func(article, false, document); + result[name] = func(article, false, window); } } writeStream.write(JSON.stringify(result)); } else { for (propertyName of wantedProperties) - writeStream.write(Properties.get(propertyName)(article, true, document) + '\n'); + writeStream.write(Properties.get(propertyName)(article, true, window) + '\n'); } } diff --git a/locales/en.json b/locales/en.json index a935c35..27eb114 100644 --- a/locales/en.json +++ b/locales/en.json @@ -8,6 +8,7 @@ "What to do if Readability.js is uncertain about what the core content actually is": "What to do if Readability.js is uncertain about what the core content actually is", "Output specific properties of the parsed article": "Output specific properties of the parsed article", "Don't output extra information to stderr": "Don't output extra information to stderr", + "Don't sanitize HTML": "Don't sanitize HTML", "Set the document URL when parsing standard input or a local file (this affects relative links)": "Set the document URL when parsing standard input or a local file (this affects relative links)", "(deprecated) alias for --base": "(deprecated) alias for --base", "Interpret SOURCE as a file name rather than a URL": "Interpret SOURCE as a file name rather than a URL", diff --git a/locales/ru.json b/locales/ru.json index 62e2159..81a29b4 100644 --- a/locales/ru.json +++ b/locales/ru.json @@ -8,6 +8,7 @@ "What to do if Readability.js is uncertain about what the core content actually is": "Что делать, когда Readability не может определить целевой контент", "Output specific properties of the parsed article": "Показать определённые характеристики текста", "Don't output extra information to stderr": "Не выдавать лишнюю информацию в стандартный вывод ошибок", + "Don't sanitize HTML": "Не убирать \"опасные\" элементы из HTML", "Set the document URL when parsing standard input or a local file (this affects relative links)": "Указать URL документа при чтении из локального файла или стандартного ввода (влияет на относительные ссылки)", "(deprecated) alias for --base": "(устаревшый) синоним для --base", "Interpret SOURCE as a file name rather than a URL": "Интерпретировать [source] как файл, а не как URL", diff --git a/package-lock.json b/package-lock.json index 7aa15a6..432d49f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "readability-cli", - "version": "2.0.0-pre", + "version": "2.0.0-pre.2", "lockfileVersion": 1, "requires": true, "dependencies": { @@ -207,6 +207,11 @@ } } }, + "dompurify": { + "version": "2.2.6", + "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.2.6.tgz", + "integrity": "sha512-7b7ZArhhH0SP6W2R9cqK6RjaU82FZ2UPM7RO8qN1b1wyvC/NY1FNWcX1Pu00fFOAnzEORtwXe4bPaClg6pUybQ==" + }, "ecc-jsbn": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz", diff --git a/package.json b/package.json index 10875f3..21ea00d 100644 --- a/package.json +++ b/package.json @@ -25,8 +25,9 @@ "license": "GPL-3.0-only", "dependencies": { "@mozilla/readability": "^0.4.1", + "dompurify": "^2.2.6", "jsdom": "^16.4.0", - "yargs": "github:gardenappl/yargs", - "y18n": "^5.0.5" + "y18n": "^5.0.5", + "yargs": "github:gardenappl/yargs" } }