diff --git a/index.js b/index.js
index 74fc312..f89081e 100755
--- a/index.js
+++ b/index.js
@@ -28,14 +28,14 @@ const locale = (
process.env.LC_MESSAGES ||
process.env.LANG ||
process.env.LANGUAGE ||
- 'en_US'
+ "en_US"
).replace(/[.:].*/, '');
const yargs = require("yargs");
const __ = require("y18n")({
locale: locale,
updateFiles: false,
- directory: path.resolve(__dirname, 'locales')
+ directory: path.resolve(__dirname, "locales")
}).__;
//JSDOM, fs, Readability, and Readability-readerable are loaded on-demand.
@@ -62,8 +62,8 @@ function setErrored(exitCode) {
//
const Properties = new Map([
- ["html-title", (article, singleLine, document) =>
- `
${escapeHTML(Properties.get('title')(article, singleLine, document), document)}
`
+ ["html-title", (article, singleLine, window) =>
+ `${escapeHTML(Properties.get("title")(article, singleLine, window), window.document)}
`
],
["title", (article, singleLine) =>
singleLine ? article.title.replace(/\n+/gm, ' ') : article.title
@@ -76,8 +76,15 @@ const Properties = new Map([
],
["length", article => article.length],
["dir", article => article.dir],
- ["html-content", article => article.content],
- ["text-content", article => article.textContent]
+ ["text-content", article => article.textContent],
+ ["html-content", (article, _, window) => {
+     //--insane opts out of sanitizing: hand back the article HTML untouched
+     if (args["insane"])
+         return article.content;
+     //dompurify is loaded on-demand and bound to the window passed in by the caller
+     const purifier = require("dompurify")(window);
+     return purifier.sanitize(article.content);
+ }]
]);
const LowConfidenceMode = {
@@ -219,6 +226,11 @@ let args = yargs
hidden: true,
//deprecated: true
})
+ .option("insane", {
+ alias: 'S',
+ type: "boolean",
+ desc: __`Don't sanitize HTML`
+ })
.option("json", {
alias: 'j',
type: "boolean",
@@ -309,9 +321,9 @@ else
inputFile = inputArg;
-const outputArg = args['output'];
+const outputArg = args["output"];
const documentURL = args["base"] || inputURL;
-const outputJSON = args['json'];
+const outputJSON = args["json"];
let wantedProperties = [];
@@ -321,7 +333,7 @@ if (args["properties"]) {
wantedProperties = args["properties"];
wantedPropertiesCustom = true;
} else {
- wantedProperties = [ 'html-title', 'html-content' ];
+ wantedProperties = [ "html-title", "html-content" ];
}
@@ -335,13 +347,11 @@ async function read(stream) {
for await (const chunk of stream){
chunks.push(chunk);
}
- return Buffer.concat(chunks).toString('utf8');
+ return Buffer.concat(chunks).toString("utf8");
}
-const JSDOM = require("jsdom").JSDOM;
-
if (inputIsFromStdin) {
if (!args["quiet"]) {
console.error("Reading...");
@@ -349,11 +359,13 @@ if (inputIsFromStdin) {
console.error(__`Warning: piping input with unknown URL. This means that relative links will be broken. Supply the --base parameter to fix.`)
}
read(process.stdin).then(result => {
+ const JSDOM = require("jsdom").JSDOM;
onLoadDOM(new JSDOM(result, { url: documentURL }));
});
} else {
if (!args["quiet"])
console.error(__`Retrieving...`);
+ const JSDOM = require("jsdom").JSDOM;
let promiseGetHTML;
if (inputURL) {
promiseGetHTML = JSDOM.fromURL(inputURL)
@@ -379,7 +391,8 @@ function escapeHTML(string, document) {
}
function onLoadDOM(dom) {
- const document = dom.window.document;
+ const window = dom.window; //kept alongside document: DOMPurify and the Properties getters take the window
+ const document = window.document;
let shouldParseArticle = true;
if (args["low-confidence"] != LowConfidenceMode.force)
@@ -413,7 +426,12 @@ function onLoadDOM(dom) {
if (!shouldParseArticle) {
//Ignore wantedProperties, that should've thrown an error before
- writeStream.write(document.documentElement.outerHTML);
+ let outputHTML = document.documentElement.outerHTML;
+ if (!args["insane"]) {
+     //Load dompurify only when sanitizing, so --insane never pays for (or depends on) it
+     outputHTML = require("dompurify")(window).sanitize(outputHTML, {WHOLE_DOCUMENT: true});
+ }
+ writeStream.write(outputHTML);
return;
}
@@ -431,16 +449,16 @@ function onLoadDOM(dom) {
let result = {};
if (wantedPropertiesCustom) {
for (propertyName of wantedProperties)
- result[propertyName] = Properties.get(propertyName)(article, false, document);
+ result[propertyName] = Properties.get(propertyName)(article, false, window);
} else {
for (const [name, func] of Properties) {
- result[name] = func(article, false, document);
+ result[name] = func(article, false, window);
}
}
writeStream.write(JSON.stringify(result));
} else {
for (propertyName of wantedProperties)
- writeStream.write(Properties.get(propertyName)(article, true, document) + '\n');
+ writeStream.write(Properties.get(propertyName)(article, true, window) + '\n');
}
}
diff --git a/locales/en.json b/locales/en.json
index a935c35..27eb114 100644
--- a/locales/en.json
+++ b/locales/en.json
@@ -8,6 +8,7 @@
"What to do if Readability.js is uncertain about what the core content actually is": "What to do if Readability.js is uncertain about what the core content actually is",
"Output specific properties of the parsed article": "Output specific properties of the parsed article",
"Don't output extra information to stderr": "Don't output extra information to stderr",
+ "Don't sanitize HTML": "Don't sanitize HTML",
"Set the document URL when parsing standard input or a local file (this affects relative links)": "Set the document URL when parsing standard input or a local file (this affects relative links)",
"(deprecated) alias for --base": "(deprecated) alias for --base",
"Interpret SOURCE as a file name rather than a URL": "Interpret SOURCE as a file name rather than a URL",
diff --git a/locales/ru.json b/locales/ru.json
index 62e2159..81a29b4 100644
--- a/locales/ru.json
+++ b/locales/ru.json
@@ -8,6 +8,7 @@
"What to do if Readability.js is uncertain about what the core content actually is": "Что делать, когда Readability не может определить целевой контент",
"Output specific properties of the parsed article": "Показать определённые характеристики текста",
"Don't output extra information to stderr": "Не выдавать лишнюю информацию в стандартный вывод ошибок",
+ "Don't sanitize HTML": "Не убирать \"опасные\" элементы из HTML",
"Set the document URL when parsing standard input or a local file (this affects relative links)": "Указать URL документа при чтении из локального файла или стандартного ввода (влияет на относительные ссылки)",
"(deprecated) alias for --base": "(устаревшый) синоним для --base",
"Interpret SOURCE as a file name rather than a URL": "Интерпретировать [source] как файл, а не как URL",
diff --git a/package-lock.json b/package-lock.json
index 7aa15a6..432d49f 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,6 +1,6 @@
{
"name": "readability-cli",
- "version": "2.0.0-pre",
+ "version": "2.0.0-pre.2",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
@@ -207,6 +207,11 @@
}
}
},
+ "dompurify": {
+ "version": "2.2.6",
+ "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.2.6.tgz",
+ "integrity": "sha512-7b7ZArhhH0SP6W2R9cqK6RjaU82FZ2UPM7RO8qN1b1wyvC/NY1FNWcX1Pu00fFOAnzEORtwXe4bPaClg6pUybQ=="
+ },
"ecc-jsbn": {
"version": "0.1.2",
"resolved": "https://registry.npmjs.org/ecc-jsbn/-/ecc-jsbn-0.1.2.tgz",
diff --git a/package.json b/package.json
index 10875f3..21ea00d 100644
--- a/package.json
+++ b/package.json
@@ -25,8 +25,9 @@
"license": "GPL-3.0-only",
"dependencies": {
"@mozilla/readability": "^0.4.1",
+ "dompurify": "^2.2.6",
"jsdom": "^16.4.0",
- "yargs": "github:gardenappl/yargs",
- "y18n": "^5.0.5"
+ "y18n": "^5.0.5",
+ "yargs": "github:gardenappl/yargs"
}
}