2015-03-13 16:19:15 +00:00
|
|
|
var prettyPrint = require("html").prettyPrint;
|
2015-03-25 13:53:12 +00:00
|
|
|
var jsdom = require("jsdom").jsdom;
|
2015-04-08 14:59:48 +00:00
|
|
|
var serializeDocument = require("jsdom").serializeDocument;
|
2015-03-20 00:01:31 +00:00
|
|
|
var chai = require("chai");
|
|
|
|
chai.config.includeStack = true;
|
|
|
|
var expect = chai.expect;
|
2015-03-13 16:19:15 +00:00
|
|
|
|
2015-04-03 10:29:05 +00:00
|
|
|
var readability = require("../index");
|
2015-04-01 19:56:08 +00:00
|
|
|
var Readability = readability.Readability;
|
|
|
|
var JSDOMParser = readability.JSDOMParser;
|
2015-03-19 02:44:47 +00:00
|
|
|
|
2015-04-08 14:24:34 +00:00
|
|
|
/**
 * Registers a mocha suite that checks one extraction result against the
 * expected fixtures.
 *
 * @param {string} label - Name given to the generated describe() suite.
 * @param {Function} beforeFn - Invoked once in a before() hook; its return
 *   value is the result object every test case inspects.
 * @param {string} expectedContent - Value that prettyPrint(result.content)
 *   must equal.
 * @param {Object} expectedMetadata - Object whose title, byline and excerpt
 *   properties are compared with the same properties of the result.
 */
function runTestsWithItems(label, beforeFn, expectedContent, expectedMetadata) {
  describe(label, function() {
    var result;

    before(function() {
      result = beforeFn();
    });

    it("should return a result object", function() {
      expect(result).to.include.keys("content", "title", "excerpt", "byline");
    });

    // The value under test goes into expect() and the fixture into eql(), so
    // that assertion failures label "actual" and "expected" the right way
    // round.
    it("should extract expected content", function() {
      expect(prettyPrint(result.content)).eql(expectedContent);
    });

    it("should extract expected title", function() {
      expect(result.title).eql(expectedMetadata.title);
    });

    it("should extract expected byline", function() {
      expect(result.byline).eql(expectedMetadata.byline);
    });

    it("should extract expected excerpt", function() {
      expect(result.excerpt).eql(expectedMetadata.excerpt);
    });

    it("should probably be readerable", function() {
      expect(result.readerable).eql(true);
    });
  });
}
|
|
|
|
|
|
|
|
/**
 * Removes every comment node found under `node`, descending into element
 * children.
 *
 * @param {Object} node - Node whose `childNodes` are scanned; matching
 *   children are detached with `node.removeChild`.
 */
function removeCommentNodesRecursively(node) {
  // Walk from the last child to the first: removing a child shifts only the
  // entries after it, which have already been visited. A forward walk over a
  // list that updates on removal would skip the sibling following each
  // removed comment.
  for (var i = node.childNodes.length - 1; i >= 0; i--) {
    var child = node.childNodes[i];
    if (child.nodeType === child.COMMENT_NODE) {
      node.removeChild(child);
    } else if (child.nodeType === child.ELEMENT_NODE) {
      removeCommentNodesRecursively(child);
    }
  }
}
|
2015-04-02 21:35:57 +00:00
|
|
|
|
2015-04-08 17:49:33 +00:00
|
|
|
// Checks the constructor options of Readability and the element-count guard
// of parse().
describe("Readability API", function() {
  describe("#constructor", function() {
    // Each case builds one instance without options and one with the option
    // set, then reads the matching underscore-prefixed field.
    it("should accept a debug option", function() {
      var byDefault = new Readability({}, {});
      expect(byDefault._debug).eql(false);
      var configured = new Readability({}, {}, {debug: true});
      expect(configured._debug).eql(true);
    });

    it("should accept a nbTopCandidates option", function() {
      var byDefault = new Readability({}, {});
      expect(byDefault._nbTopCandidates).eql(5);
      var configured = new Readability({}, {}, {nbTopCandidates: 42});
      expect(configured._nbTopCandidates).eql(42);
    });

    it("should accept a maxPages option", function() {
      var byDefault = new Readability({}, {});
      expect(byDefault._maxPages).eql(5);
      var configured = new Readability({}, {}, {maxPages: 42});
      expect(configured._maxPages).eql(42);
    });

    it("should accept a maxElemsToParse option", function() {
      var byDefault = new Readability({}, {});
      expect(byDefault._maxElemsToParse).eql(0);
      var configured = new Readability({}, {}, {maxElemsToParse: 42});
      expect(configured._maxElemsToParse).eql(42);
    });
  });

  describe("#parse", function() {
    it("shouldn't parse oversized documents as per configuration", function() {
      var doc = new JSDOMParser().parse("<html><div>yo</div></html>");

      // Parsing with a limit of one element is expected to throw.
      function parseWithLimit() {
        new Readability({}, doc, {maxElemsToParse: 1}).parse();
      }

      expect(parseWithLimit).to.Throw("Aborting parsing document; 2 elements found");
    });
  });
});
|
|
|
|
|
2015-04-08 14:59:48 +00:00
|
|
|
// Fixed stand-in for a page location; handed to the Readability constructor
// as its first argument by the suites in this file.
var uri = {
  // Full address of the pretend page.
  spec: "http://fakehost/test/page.html",
  host: "fakehost",
  // Scheme and host, without any path.
  prePath: "http://fakehost",
  scheme: "http",
  // Address of the directory containing the pretend page.
  pathBase: "http://fakehost/test/"
};
|
|
|
|
|
|
|
|
// Options passed as the second argument of every jsdom() call in this file.
// NOTE(review): both feature switches are off, presumably so that jsdom
// neither downloads nor processes resources referenced by the test pages —
// confirm against the jsdom version in use.
var jsdomOptions = {
  features: {
    FetchExternalResources: false,
    ProcessExternalResources: false
  }
};
|
|
|
|
|
|
|
|
// For every extraction test page, runs the shared checks twice: once on a
// document built by jsdom and once on a document built by JSDOMParser.
describe("Extraction", function() {
  // Runs Readability over `doc` and returns the parse result with the
  // detection verdict attached as `readerable`. Detection is queried before
  // parse(), in the same order for both parsers.
  function extractFrom(doc) {
    var reader = new Readability(uri, doc);
    var readerable = reader.isProbablyReaderable();
    var result = reader.parse();
    result.readerable = readerable;
    return result;
  }

  var testPages = require("./bootstrap").getExtractionTestPages();

  testPages.forEach(function(testPage) {
    describe(testPage.dir, function() {
      runTestsWithItems("jsdom", function() {
        var doc = jsdom(testPage.source, jsdomOptions);
        // Comment nodes are stripped from the jsdom document only.
        removeCommentNodesRecursively(doc);
        return extractFrom(doc);
      }, testPage.expectedContent, testPage.expectedMetadata);

      runTestsWithItems("JSDOMParser", function() {
        var doc = new JSDOMParser().parse(testPage.source);
        return extractFrom(doc);
      }, testPage.expectedContent, testPage.expectedMetadata);
    });
  });
});
|
2015-04-08 14:59:48 +00:00
|
|
|
|
|
|
|
// For every detection test page, checks that isProbablyReaderable() on a
// jsdom-built document matches the page's expected `readerable` flag.
describe("Detection", function() {
  var testPages = require("./bootstrap").getDetectionTestPages();

  testPages.forEach(function(testPage) {
    describe(testPage.file, function() {
      var readerable;

      before(function() {
        var doc = jsdom(testPage.source, jsdomOptions);
        readerable = new Readability(uri, doc).isProbablyReaderable();
      });

      // The test title states which verdict this page is expected to get.
      var verdict = testPage.readerable ? "readerable" : "non-readerable";

      it("should be detected as " + verdict, function() {
        expect(readerable).eql(testPage.readerable);
      });
    });
  });
});
|