var jsdom = require("jsdom").jsdom; var chai = require("chai"); chai.config.includeStack = true; var expect = chai.expect; var readability = require("../index"); var Readability = readability.Readability; var JSDOMParser = readability.JSDOMParser; var testPages = require("./utils").getTestPages(); function reformatError(err) { var formattedError = new Error(err.message); formattedError.stack = err.stack; return formattedError; } function inOrderTraverse(fromNode) { if (fromNode.firstChild) { return fromNode.firstChild; } while (fromNode && !fromNode.nextSibling) { fromNode = fromNode.parentNode; } return fromNode ? fromNode.nextSibling : null; } function inOrderIgnoreEmptyTextNodes(fromNode) { do { fromNode = inOrderTraverse(fromNode); } while (fromNode && fromNode.nodeType == 3 && !fromNode.textContent.trim()); return fromNode; } function traverseDOM(callback, expectedDOM, actualDOM) { var actualNode = actualDOM.documentElement || actualDOM.childNodes[0]; var expectedNode = expectedDOM.documentElement || expectedDOM.childNodes[0]; while (actualNode) { if (!callback(actualNode, expectedNode)) { break; } actualNode = inOrderIgnoreEmptyTextNodes(actualNode); expectedNode = inOrderIgnoreEmptyTextNodes(expectedNode); } } // Collapse subsequent whitespace like HTML: function htmlTransform(str) { return str.replace(/\s+/g, " "); } function runTestsWithItems(label, domGenerationFn, uri, source, expectedContent, expectedMetadata) { describe(label, function() { this.timeout(10000); var result; before(function() { try { var doc = domGenerationFn(source); var myReader = new Readability(uri, doc); // Needs querySelectorAll function to test isProbablyReaderable method. // jsdom implements querySelector but JSDOMParser doesn't. var readerable = label === "jsdom" ? myReader.isProbablyReaderable() : null; result = myReader.parse(); result.readerable = readerable; } catch (err) { throw reformatError(err); } }); it("should return a result object", function() { expect(result).to.include.keys("content", "title", "excerpt", "byline"); }); it("should extract expected content", function() { function nodeStr(n) { if (n.nodeType == 3) { return "#text(" + htmlTransform(n.textContent) + ")"; } if (n.nodeType != 1) { return "some other node type: " + n.nodeType + " with data " + n.data; } var rv = n.localName; if (n.id) { rv += "#" + n.id; } if (n.className) { rv += ".(" + n.className + ")"; } return rv; } function genPath(node) { if (node.id) { return '#' + node.id; } if (node.tagName == "BODY") { return 'body'; } var parent = node.parentNode; var parentPath = genPath(parent); var index = Array.prototype.indexOf.call(parent.childNodes, node) + 1; return parentPath + " > " + nodeStr(node) + ":nth-child(" + index + ")"; } function findableNodeDesc(node) { return genPath(node) + "(in: ``" + node.parentNode.innerHTML + "``)"; } var actualDOM = domGenerationFn(result.content); var expectedDOM = domGenerationFn(expectedContent); traverseDOM(function(actualNode, expectedNode) { expect(!!actualNode).eql(!!expectedNode); if (actualNode && expectedNode) { var actualDesc = nodeStr(actualNode); var expectedDesc = nodeStr(expectedNode); if (actualDesc != expectedDesc) { expect(actualDesc, findableNodeDesc(actualNode)).eql(expectedDesc); return false; } // Compare text for text nodes: if (actualNode.nodeType == 3) { var actualText = htmlTransform(actualNode.textContent); var expectedText = htmlTransform(expectedNode.textContent); expect(actualText, findableNodeDesc(actualNode)).eql(expectedText); if (actualText != expectedText) { return false; } // Compare attributes for element nodes: } else if (actualNode.nodeType == 1) { expect(actualNode.attributes.length).eql(expectedNode.attributes.length); for (var i = 0; i < actualNode.attributes.length; i++) { var attr = actualNode.attributes[i].name; var actualValue = actualNode.getAttribute(attr); var expectedValue = expectedNode.getAttribute(attr); expect(expectedValue, "node (" + findableNodeDesc(actualNode) + ") attribute " + attr + " should match").eql(actualValue); } } } else { return false; } return true; }, actualDOM, expectedDOM); }); it("should extract expected title", function() { expect(expectedMetadata.title).eql(result.title); }); it("should extract expected byline", function() { expect(expectedMetadata.byline).eql(result.byline); }); it("should extract expected excerpt", function() { expect(expectedMetadata.excerpt).eql(result.excerpt); }); expectedMetadata.dir && it("should extract expected direction", function() { expect(expectedMetadata.dir).eql(result.dir); }); label === "jsdom" && it("should probably be readerable", function() { expect(expectedMetadata.readerable).eql(result.readerable); }); }); } function removeCommentNodesRecursively(node) { for (var i = node.childNodes.length - 1; i >= 0; i--) { var child = node.childNodes[i]; if (child.nodeType === child.COMMENT_NODE) { node.removeChild(child); } else if (child.nodeType === child.ELEMENT_NODE) { removeCommentNodesRecursively(child); } } } describe("Readability API", function() { describe("#constructor", function() { it("should accept a debug option", function() { expect(new Readability({}, {})._debug).eql(false); expect(new Readability({}, {}, {debug: true})._debug).eql(true); }); it("should accept a nbTopCandidates option", function() { expect(new Readability({}, {})._nbTopCandidates).eql(5); expect(new Readability({}, {}, {nbTopCandidates: 42})._nbTopCandidates).eql(42); }); it("should accept a maxPages option", function() { expect(new Readability({}, {})._maxPages).eql(5); expect(new Readability({}, {}, {maxPages: 42})._maxPages).eql(42); }); it("should accept a maxElemsToParse option", function() { expect(new Readability({}, {})._maxElemsToParse).eql(0); expect(new Readability({}, {}, {maxElemsToParse: 42})._maxElemsToParse).eql(42); }); }); describe("#parse", function() { it("shouldn't parse oversized documents as per configuration", function() { var doc = new JSDOMParser().parse("
yo
"); expect(function() { new Readability({}, doc, {maxElemsToParse: 1}).parse(); }).to.Throw("Aborting parsing document; 2 elements found"); }); }); }); describe("Test pages", function() { testPages.forEach(function(testPage) { describe(testPage.dir, function() { var uri = { spec: "http://fakehost/test/page.html", host: "fakehost", prePath: "http://fakehost", scheme: "http", pathBase: "http://fakehost/test/" }; runTestsWithItems("jsdom", function(source) { var doc = jsdom(source, { features: { FetchExternalResources: false, ProcessExternalResources: false } }); removeCommentNodesRecursively(doc); return doc; }, uri, testPage.source, testPage.expectedContent, testPage.expectedMetadata); runTestsWithItems("JSDOMParser", function(source) { var parser = new JSDOMParser(); var doc = parser.parse(source); if (parser.errorState) { console.error("Parsing this DOM caused errors:", parser.errorState); return null; } return doc; }, uri, testPage.source, testPage.expectedContent, testPage.expectedMetadata); }); }); });