From 8ddba60425bd3240631d56d44e44e0c0315470c2 Mon Sep 17 00:00:00 2001 From: Gijs Kruitbosch Date: Tue, 31 Mar 2015 19:23:27 +0100 Subject: [PATCH] Fix script parsing to ignore closing tags in comments --- JSDOMParser.js | 74 ++++++++++++++----- test/test-jsdomparser.js | 39 ++++++++++ .../expected-metadata.json | 5 ++ .../expected.html | 19 +++++ .../comment-inside-script-parsing/source.html | 34 +++++++++ 5 files changed, 151 insertions(+), 20 deletions(-) create mode 100644 test/test-pages/comment-inside-script-parsing/expected-metadata.json create mode 100644 test/test-pages/comment-inside-script-parsing/expected.html create mode 100644 test/test-pages/comment-inside-script-parsing/source.html diff --git a/JSDOMParser.js b/JSDOMParser.js index 34a863e..d8230ef 100644 --- a/JSDOMParser.js +++ b/JSDOMParser.js @@ -919,14 +919,59 @@ }, readScript: function (node) { - var index = this.html.indexOf("", this.currentChar); - if (index === -1) { - index = this.html.length; + while (this.currentChar < this.html.length) { + var c = this.nextChar(); + var nextC = this.peekNext(); + if (c === "<") { + if (nextC === "!" || nextC === "?") { + // We're still before the ! or ? that is starting this comment: + this.currentChar++; + node.appendChild(this.discardNextComment()); + continue; + } + if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") { + // Go back before the '<' so we find the end tag. + this.currentChar--; + // Done with this script tag, the caller will close: + return; + } + } + // Either c wasn't a '<' or it was but we couldn't find either a comment + // or a closing script tag, so we should just parse as text until the next one + // comes along: + + var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE; + var textNode = haveTextNode ? node.lastChild : new Text(); + var n = this.html.indexOf("<", this.currentChar); + // Decrement this to include the current character *afterwards* so we don't get stuck + // looking for the same < all the time. + this.currentChar--; + if (n === -1) { + textNode.textContent += this.html.substring(this.currentChar, this.html.length); + this.currentChar = this.html.length; + } else { + textNode.textContent += this.html.substring(this.currentChar, n); + this.currentChar = n; + } + if (!haveTextNode) + node.appendChild(textNode); } - var txt = new Text(); - txt.textContent = this.html.substring(this.currentChar, index === -1 ? this.html.length : index); - node.appendChild(txt); - this.currentChar = index; + }, + + discardNextComment: function() { + if (this.match("--")) { + this.discardTo("-->"); + } else { + var c = this.nextChar(); + while (c !== ">") { + if (c === undefined) + return null; + if (c === '"' || c === "'") + this.readString(c); + c = this.nextChar(); + } + } + return new Comment(); }, @@ -964,20 +1009,9 @@ // them away in readChildren()). So just returning an empty Comment node // here is sufficient. if (c === "!" || c === "?") { + // We're still before the ! or ? that is starting this comment: this.currentChar++; - if (this.match("--")) { - this.discardTo("-->"); - } else { - var c = this.nextChar(); - while (c !== ">") { - if (c === undefined) - return null; - if (c === '"' || c === "'") - this.readString(c); - c = this.nextChar(); - } - } - return new Comment(); + return this.discardNextComment(); } // If we're reading a closing tag, return null. This means we've reached diff --git a/test/test-jsdomparser.js b/test/test-jsdomparser.js index 8996c86..bd44fc3 100644 --- a/test/test-jsdomparser.js +++ b/test/test-jsdomparser.js @@ -208,3 +208,42 @@ describe("Test JSDOM functionality", function() { } }); }); + + +describe("Script parsing", function() { + it("should strip ?-based comments within script tags", function() { + var html = ''; + var doc = new JSDOMParser().parse(html); + expect(doc.firstChild.tagName).eql("SCRIPT"); + expect(doc.firstChild.textContent).eql(""); + expect(doc.firstChild.children.length).eql(0); + expect(doc.firstChild.childNodes.length).eql(1); + }); + + it("should strip !-based comments within script tags", function() { + var html = ''; + var doc = new JSDOMParser().parse(html); + expect(doc.firstChild.tagName).eql("SCRIPT"); + expect(doc.firstChild.textContent).eql(""); + expect(doc.firstChild.children.length).eql(0); + expect(doc.firstChild.childNodes.length).eql(1); + }); + + it("should strip any other nodes within script tags", function() { + var html = ""; + var doc = new JSDOMParser().parse(html); + expect(doc.firstChild.tagName).eql("SCRIPT"); + expect(doc.firstChild.textContent).eql("
Hello, I'm not really in a
"); + expect(doc.firstChild.children.length).eql(0); + expect(doc.firstChild.childNodes.length).eql(1); + }); + + it("should not be confused by partial closing tags", function() { + var html = ""; + var doc = new JSDOMParser().parse(html); + expect(doc.firstChild.tagName).eql("SCRIPT"); + expect(doc.firstChild.textContent).eql("var x = ' +
+

Lorem

+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua.

+

Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+

Foo

+
+

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat.

+

Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. + Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
+ +