Merge pull request #135 from gijsk/links
Bug 1147584 - Don't strip unlikely <a>s, and replace useless <a>s with textContent
This commit is contained in:
commit
eb7ec7231e
@ -48,6 +48,12 @@
|
|||||||
"'": "&apos;",
|
"'": "&apos;",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
function encodeTextContentHTML(s) {
|
||||||
|
return s.replace(/[&<>]/g, function(x) {
|
||||||
|
return reverseEntityTable[x];
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
function encodeHTML(s) {
|
function encodeHTML(s) {
|
||||||
return s.replace(/[&<>'"]/g, function(x) {
|
return s.replace(/[&<>'"]/g, function(x) {
|
||||||
return reverseEntityTable[x];
|
return reverseEntityTable[x];
|
||||||
@ -535,7 +541,7 @@
|
|||||||
},
|
},
|
||||||
get innerHTML() {
|
get innerHTML() {
|
||||||
if (typeof this._innerHTML === "undefined") {
|
if (typeof this._innerHTML === "undefined") {
|
||||||
this._innerHTML = encodeHTML(this._textContent || "");
|
this._innerHTML = encodeTextContentHTML(this._textContent || "");
|
||||||
}
|
}
|
||||||
return this._innerHTML;
|
return this._innerHTML;
|
||||||
},
|
},
|
||||||
@ -583,7 +589,13 @@
|
|||||||
createElement: function (tag) {
|
createElement: function (tag) {
|
||||||
var node = new Element(tag);
|
var node = new Element(tag);
|
||||||
return node;
|
return node;
|
||||||
}
|
},
|
||||||
|
|
||||||
|
createTextNode: function (text) {
|
||||||
|
var node = new Text();
|
||||||
|
node.textContent = text;
|
||||||
|
return node;
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
var Element = function (tag) {
|
var Element = function (tag) {
|
||||||
|
@ -197,20 +197,28 @@ Readability.prototype = {
|
|||||||
return pathBase + uri;
|
return pathBase + uri;
|
||||||
}
|
}
|
||||||
|
|
||||||
function convertRelativeURIs(tagName, propName) {
|
var links = articleContent.getElementsByTagName("a");
|
||||||
var elems = articleContent.getElementsByTagName(tagName);
|
this._forEachNode(links, function(link) {
|
||||||
this._forEachNode(elems, function(elem) {
|
var href = link.getAttribute("href");
|
||||||
var relativeURI = elem.getAttribute(propName);
|
if (href) {
|
||||||
if (relativeURI != null)
|
// Replace links with javascript: URIs with text content, since
|
||||||
elem.setAttribute(propName, toAbsoluteURI(relativeURI));
|
// they won't work after scripts have been removed from the page.
|
||||||
});
|
if (href.indexOf("javascript:") === 0) {
|
||||||
|
var text = this._doc.createTextNode(link.textContent);
|
||||||
|
link.parentNode.replaceChild(text, link);
|
||||||
|
} else {
|
||||||
|
link.setAttribute("href", toAbsoluteURI(href));
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
// Fix links.
|
var imgs = articleContent.getElementsByTagName("img");
|
||||||
convertRelativeURIs.call(this, "a", "href");
|
this._forEachNode(imgs, function(img) {
|
||||||
|
var src = img.getAttribute("src");
|
||||||
// Fix images.
|
if (src) {
|
||||||
convertRelativeURIs.call(this, "img", "src");
|
img.setAttribute("src", toAbsoluteURI(src));
|
||||||
|
}
|
||||||
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -587,7 +595,8 @@ Readability.prototype = {
|
|||||||
if (stripUnlikelyCandidates) {
|
if (stripUnlikelyCandidates) {
|
||||||
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
|
if (this.REGEXPS.unlikelyCandidates.test(matchString) &&
|
||||||
!this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
|
!this.REGEXPS.okMaybeItsACandidate.test(matchString) &&
|
||||||
node.tagName !== "BODY") {
|
node.tagName !== "BODY" &&
|
||||||
|
node.tagName !== "A") {
|
||||||
this.log("Removing unlikely candidate - " + matchString);
|
this.log("Removing unlikely candidate - " + matchString);
|
||||||
node = this._removeAndGetNext(node);
|
node = this._removeAndGetNext(node);
|
||||||
continue;
|
continue;
|
||||||
|
@ -218,8 +218,10 @@ describe("Test HTML escaping", function() {
|
|||||||
// let's manipulate via textContent in order to test that it alters
|
// let's manipulate via textContent in order to test that it alters
|
||||||
// the innerHTML correctly.
|
// the innerHTML correctly.
|
||||||
txtNode.textContent = txtNode.textContent + " ";
|
txtNode.textContent = txtNode.textContent + " ";
|
||||||
expect("<p>" + txtNode.innerHTML + "</p>").eql(baseStr.replace("</p>", " </p>"));
|
txtNode.textContent = txtNode.textContent.trim();
|
||||||
expect("<p>" + p.innerHTML + "</p>").eql(baseStr.replace("</p>", " </p>"));
|
var expectedHTML = baseStr.replace("&quot;", '"').replace("&apos;", "'");
|
||||||
|
expect("<p>" + txtNode.innerHTML + "</p>").eql(expectedHTML);
|
||||||
|
expect("<p>" + p.innerHTML + "</p>").eql(expectedHTML);
|
||||||
|
|
||||||
});
|
});
|
||||||
|
|
||||||
|
6
test/test-pages/clean-links/expected-metadata.json
Normal file
6
test/test-pages/clean-links/expected-metadata.json
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
{
|
||||||
|
"title": "Bartleby the Scrivener Web Study Text",
|
||||||
|
"byline": null,
|
||||||
|
"excerpt": "Ere introducing the scrivener, as he first appeared to me, it is fit \n I make some mention of myself, my employees, my business, my chambers, \n and general surroundings; because some such description is indispensable \n to an adequate understanding of the chief character about to be presented.",
|
||||||
|
"readerable": true
|
||||||
|
}
|
1250
test/test-pages/clean-links/expected.html
Normal file
1250
test/test-pages/clean-links/expected.html
Normal file
File diff suppressed because it is too large
Load Diff
1863
test/test-pages/clean-links/source.html
Normal file
1863
test/test-pages/clean-links/source.html
Normal file
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user