Merge pull request #135 from gijsk/links

Bug 1147584 - Don't strip unlikely <a>s, and replace useless <a>s with textContent
pull/146/head
Margaret Leibovic 10 years ago
commit eb7ec7231e

@ -48,6 +48,12 @@
"'": "&apos;",
};
/**
 * Escape a string for use as the markup of a text node.
 *
 * Only the characters "&", "<" and ">" are replaced, each with the value
 * stored for it in reverseEntityTable. Quote characters are left untouched.
 *
 * @param {string} s The raw text to escape.
 * @return {string} The text with "&", "<" and ">" replaced.
 */
function encodeTextContentHTML(s) {
  var lookupEntity = function(matchedChar) {
    return reverseEntityTable[matchedChar];
  };
  return s.replace(/[&<>]/g, lookupEntity);
}
function encodeHTML(s) {
return s.replace(/[&<>'"]/g, function(x) {
return reverseEntityTable[x];
@ -535,7 +541,7 @@
},
get innerHTML() {
if (typeof this._innerHTML === "undefined") {
this._innerHTML = encodeHTML(this._textContent || "");
this._innerHTML = encodeTextContentHTML(this._textContent || "");
}
return this._innerHTML;
},
@ -583,7 +589,13 @@
createElement: function (tag) {
var node = new Element(tag);
return node;
}
},
createTextNode: function (text) {
var node = new Text();
node.textContent = text;
return node;
},
};
var Element = function (tag) {

@ -197,20 +197,28 @@ Readability.prototype = {
return pathBase + uri;
}
/**
 * Rewrite one attribute on every element with the given tag name inside
 * articleContent, passing its current value through toAbsoluteURI.
 *
 * Must be invoked with `this` set to an object providing _forEachNode
 * (e.g. via convertRelativeURIs.call(this, ...)). Elements whose
 * attribute is null or undefined are left untouched.
 *
 * @param {string} tagName  Tag name of the elements to visit.
 * @param {string} propName Name of the attribute to rewrite.
 */
function convertRelativeURIs(tagName, propName) {
  var matches = articleContent.getElementsByTagName(tagName);
  this._forEachNode(matches, function(node) {
    var currentValue = node.getAttribute(propName);
    if (currentValue == null) {
      // Attribute is absent; nothing to rewrite.
      return;
    }
    node.setAttribute(propName, toAbsoluteURI(currentValue));
  });
}
// Fix links.
convertRelativeURIs.call(this, "a", "href");
var links = articleContent.getElementsByTagName("a");
this._forEachNode(links, function(link) {
var href = link.getAttribute("href");
if (href) {
// Replace links with javascript: URIs with text content, since
// they won't work after scripts have been removed from the page.
if (href.indexOf("javascript:") === 0) {
var text = this._doc.createTextNode(link.textContent);
link.parentNode.replaceChild(text, link);
} else {
link.setAttribute("href", toAbsoluteURI(href));
}
}
});
// Fix images.
convertRelativeURIs.call(this, "img", "src");
var imgs = articleContent.getElementsByTagName("img");
this._forEachNode(imgs, function(img) {
var src = img.getAttribute("src");
if (src) {
img.setAttribute("src", toAbsoluteURI(src));
}
});
},
/**
@ -587,7 +595,8 @@ Readability.prototype = {
if (stripUnlikelyCandidates) {
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
!this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
node.tagName !== "BODY") {
node.tagName !== "BODY" &&
node.tagName !== "A") {
this.log("Removing unlikely candidate - " + matchString);
node = this._removeAndGetNext(node);
continue;

@ -218,8 +218,10 @@ describe("Test HTML escaping", function() {
// let's manipulate via textContent in order to test that it alters
// the innerHTML correctly.
txtNode.textContent = txtNode.textContent + " ";
expect("<p>" + txtNode.innerHTML + "</p>").eql(baseStr.replace("</p>", " </p>"));
expect("<p>" + p.innerHTML + "</p>").eql(baseStr.replace("</p>", " </p>"));
txtNode.textContent = txtNode.textContent.trim();
var expectedHTML = baseStr.replace("&quot;", '"').replace("&apos;", "'");
expect("<p>" + txtNode.innerHTML + "</p>").eql(expectedHTML);
expect("<p>" + p.innerHTML + "</p>").eql(expectedHTML);
});

@ -0,0 +1,6 @@
{
"title": "Bartleby the Scrivener Web Study Text",
"byline": null,
"excerpt": "Ere introducing the scrivener, as he first appeared to me, it is fit \n I make some mention of myself, my employees, my business, my chambers, \n and general surroundings; because some such description is indispensable \n to an adequate understanding of the chief character about to be presented.",
"readerable": true
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -31,4 +31,4 @@
</p><a name="continued"></a>
</div>
</div>
</div>
</div>
Loading…
Cancel
Save