Merge pull request #135 from gijsk/links

Bug 1147584 - Don't strip unlikely <a>s, and replace useless <a>s with textContent
This commit is contained in:
Margaret Leibovic 2015-04-13 07:00:10 -07:00
commit eb7ec7231e
7 changed files with 3161 additions and 19 deletions

View File

@@ -48,6 +48,12 @@
"'": "&apos;", "'": "&apos;",
}; };
/**
 * Escape a string for use as HTML text content.
 *
 * Every "&", "<" and ">" in the input is swapped for the value stored under
 * that character in reverseEntityTable. Quote characters are not matched by
 * the pattern and therefore pass through unchanged.
 *
 * @param {string} s - The raw text to escape.
 * @returns {string} The text with "&", "<" and ">" replaced.
 */
function encodeTextContentHTML(s) {
  // Look up the replacement for a single matched character.
  var toEntity = function(matchedChar) {
    return reverseEntityTable[matchedChar];
  };
  return s.replace(/[&<>]/g, toEntity);
}
function encodeHTML(s) { function encodeHTML(s) {
return s.replace(/[&<>'"]/g, function(x) { return s.replace(/[&<>'"]/g, function(x) {
return reverseEntityTable[x]; return reverseEntityTable[x];
@@ -535,7 +541,7 @@
}, },
get innerHTML() { get innerHTML() {
if (typeof this._innerHTML === "undefined") { if (typeof this._innerHTML === "undefined") {
this._innerHTML = encodeHTML(this._textContent || ""); this._innerHTML = encodeTextContentHTML(this._textContent || "");
} }
return this._innerHTML; return this._innerHTML;
}, },
@@ -583,7 +589,13 @@
createElement: function (tag) { createElement: function (tag) {
var node = new Element(tag); var node = new Element(tag);
return node; return node;
} },
createTextNode: function (text) {
var node = new Text();
node.textContent = text;
return node;
},
}; };
var Element = function (tag) { var Element = function (tag) {

View File

@@ -197,20 +197,28 @@ Readability.prototype = {
return pathBase + uri; return pathBase + uri;
} }
function convertRelativeURIs(tagName, propName) { var links = articleContent.getElementsByTagName("a");
var elems = articleContent.getElementsByTagName(tagName); this._forEachNode(links, function(link) {
this._forEachNode(elems, function(elem) { var href = link.getAttribute("href");
var relativeURI = elem.getAttribute(propName); if (href) {
if (relativeURI != null) // Replace links with javascript: URIs with text content, since
elem.setAttribute(propName, toAbsoluteURI(relativeURI)); // they won't work after scripts have been removed from the page.
}); if (href.indexOf("javascript:") === 0) {
var text = this._doc.createTextNode(link.textContent);
link.parentNode.replaceChild(text, link);
} else {
link.setAttribute("href", toAbsoluteURI(href));
} }
}
});
// Fix links. var imgs = articleContent.getElementsByTagName("img");
convertRelativeURIs.call(this, "a", "href"); this._forEachNode(imgs, function(img) {
var src = img.getAttribute("src");
// Fix images. if (src) {
convertRelativeURIs.call(this, "img", "src"); img.setAttribute("src", toAbsoluteURI(src));
}
});
}, },
/** /**
@@ -587,7 +595,8 @@ Readability.prototype = {
if (stripUnlikelyCandidates) { if (stripUnlikelyCandidates) {
if (this.REGEXPS.unlikelyCandidates.test(matchString) && if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
!this.REGEXPS.okMaybeItsACandidate.test(matchString) && !this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
node.tagName !== "BODY") { node.tagName !== "BODY" &&
node.tagName !== "A") {
this.log("Removing unlikely candidate - " + matchString); this.log("Removing unlikely candidate - " + matchString);
node = this._removeAndGetNext(node); node = this._removeAndGetNext(node);
continue; continue;

View File

@@ -218,8 +218,10 @@ describe("Test HTML escaping", function() {
// let's manipulate via textContent in order to test that it alters // let's manipulate via textContent in order to test that it alters
// the innerHTML correctly. // the innerHTML correctly.
txtNode.textContent = txtNode.textContent + " "; txtNode.textContent = txtNode.textContent + " ";
expect("<p>" + txtNode.innerHTML + "</p>").eql(baseStr.replace("</p>", " </p>")); txtNode.textContent = txtNode.textContent.trim();
expect("<p>" + p.innerHTML + "</p>").eql(baseStr.replace("</p>", " </p>")); var expectedHTML = baseStr.replace("&quot;", '"').replace("&apos;", "'");
expect("<p>" + txtNode.innerHTML + "</p>").eql(expectedHTML);
expect("<p>" + p.innerHTML + "</p>").eql(expectedHTML);
}); });

View File

@@ -0,0 +1,6 @@
{
"title": "Bartleby the Scrivener Web Study Text",
"byline": null,
"excerpt": "Ere introducing the scrivener, as he first appeared to me, it is fit \n I make some mention of myself, my employees, my business, my chambers, \n and general surroundings; because some such description is indispensable \n to an adequate understanding of the chief character about to be presented.",
"readerable": true
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff