2015-03-19 02:44:47 +00:00
|
|
|
// When true, the script logs extra progress information (HTTP status and
// headers, and the individual write/processing steps) to the console.
var debug = false;
|
|
|
|
|
|
|
|
var path = require("path");
|
|
|
|
var fs = require("fs");
|
|
|
|
var jsdom = require("jsdom").jsdom;
|
2015-04-21 08:26:15 +00:00
|
|
|
var prettyPrint = require("./utils").prettyPrint;
|
2015-03-19 02:44:47 +00:00
|
|
|
var serializeDocument = require("jsdom").serializeDocument;
|
|
|
|
var http = require("http");
|
2015-04-22 15:16:43 +00:00
|
|
|
var urlparse = require("url").parse;
|
2015-03-19 02:44:47 +00:00
|
|
|
|
2015-04-03 10:29:05 +00:00
|
|
|
var readability = require("../index");
|
|
|
|
var Readability = readability.Readability;
|
|
|
|
var JSDOMParser = readability.JSDOMParser;
|
2015-03-19 02:44:47 +00:00
|
|
|
|
2015-04-22 15:16:43 +00:00
|
|
|
// Value sent as the User-Agent request header when downloading a page.
var FFX_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0";
|
|
|
|
|
2015-03-19 02:44:47 +00:00
|
|
|
if (process.argv.length < 3) {
|
|
|
|
console.error("Need at least a destination slug and potentially a URL (if the slug doesn't have source).");
|
|
|
|
process.exit(0);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
var slug = process.argv[2];
|
|
|
|
var url = process.argv[3]; // Could be undefined, we'll warn if it is if that is an issue.
|
|
|
|
|
|
|
|
var destRoot = path.join(__dirname, "test-pages", slug);
|
|
|
|
|
|
|
|
// Create the destination directory, then obtain the page source: reuse a
// previously saved source.html when the directory already existed, otherwise
// download the page from the URL given on the command line.
fs.mkdir(destRoot, function(err) {
  if (!err) {
    // Freshly created directory: nothing can be saved in it yet.
    fetchSource(url, onResponseReceived);
    return;
  }

  if (err.code !== "EEXIST") {
    // Only "already exists" is an expected failure. Anything else (missing
    // parent, permissions, ...) would make the later writes fail as well, so
    // stop here and show the real cause.
    console.error("Couldn't create " + destRoot + "!");
    console.error(err);
    process.exit(1);
    return;
  }

  var sourceFile = path.join(destRoot, "source.html");
  // Read the file directly rather than asking fs.exists() first: fs.exists is
  // deprecated, and checking before reading leaves a window in which the file
  // can appear or disappear.
  fs.readFile(sourceFile, {encoding: "utf-8"}, function(readErr, data) {
    if (!readErr) {
      onResponseReceived(data);
      return;
    }
    if (readErr.code === "ENOENT") {
      // No saved source for this slug yet: fall back to downloading it.
      fetchSource(url, onResponseReceived);
      return;
    }
    console.error("Source existed but couldn't be read?");
    console.error(readErr);
    process.exit(1);
  });
});
|
|
|
|
|
|
|
|
/**
 * Download the page at `url` and pass its cleaned-up HTML to `callbackFn`.
 *
 * The response body is collected as UTF-8 text, parsed with jsdom,
 * re-serialized and pretty-printed before it is handed over. The process is
 * terminated with exit status 1 when no URL was supplied or when the request
 * itself fails (DNS failure, refused connection, ...).
 *
 * @param {string} url - Address of the page to fetch.
 * @param {function(string)} callbackFn - Called once with the sanitized source.
 */
function fetchSource(url, callbackFn) {
  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
    return;
  }

  var options = urlparse(url);
  options.headers = {'User-Agent': FFX_UA};

  // Choose the client from the parsed scheme instead of a raw string prefix,
  // so the decision matches what the request options actually contain.
  var client = http;
  if (options.protocol === "https:") {
    client = require("https");
  }

  var request = client.get(options, function(response) {
    if (debug) {
      console.log("STATUS:", response.statusCode);
      console.log("HEADERS:", JSON.stringify(response.headers));
    }
    response.setEncoding("utf-8");
    var rv = "";
    response.on("data", function(chunk) {
      rv += chunk;
    });
    response.on("end", function() {
      if (debug) {
        console.log("End received");
      }
      // Sanitize:
      rv = prettyPrint(serializeDocument(jsdom(rv)));
      callbackFn(rv);
    });
  });

  // Without a listener a failed request raises an unhandled "error" event;
  // report it in the same way as the other fatal problems in this script.
  request.on("error", function(err) {
    console.error("Couldn't fetch " + url + "!");
    console.error(err);
    process.exit(1);
  });
}
|
|
|
|
|
|
|
|
/**
 * Store the page source as source.html inside destRoot and, once that write
 * has succeeded, produce the expected readability output beside it.
 *
 * A failed write is reported on stderr and nothing further is generated.
 *
 * @param {string} source - HTML of the page.
 */
function onResponseReceived(source) {
  // Progress output is only wanted when the debug flag is on.
  var trace = function(message) {
    if (debug) {
      console.log(message);
    }
  };
  // Every file this function touches lives directly under destRoot.
  var inDestRoot = function(fileName) {
    return path.join(destRoot, fileName);
  };

  trace("writing");
  fs.writeFile(inDestRoot("source.html"), source, function(writeError) {
    if (!writeError) {
      trace("Running readability stuff");
      runReadability(source, inDestRoot("expected.html"), inDestRoot("expected-metadata.json"));
      return;
    }
    console.error("Couldn't write data to source.html!");
    console.error(writeError);
  });
}
|
|
|
|
|
2015-03-19 21:18:58 +00:00
|
|
|
/**
 * Run Readability over `source` and write the results to disk.
 *
 * The article content is pretty-printed into `destPath`; afterwards the
 * remaining result fields, plus the isProbablyReaderable() outcome stored as
 * `readerable`, are written as JSON to `metadataDestPath`. When Readability
 * throws or yields no result, nothing is written. The process exits once the
 * metadata write has finished: status 0 when both writes succeeded, 1 when
 * either of them failed.
 *
 * @param {string} source - HTML to analyse.
 * @param {string} destPath - File that receives the extracted content.
 * @param {string} metadataDestPath - File that receives the JSON metadata.
 */
function runReadability(source, destPath, metadataDestPath) {
  var doc = new JSDOMParser().parse(source);
  // Readability is always given the same placeholder location, independent of
  // where the page was downloaded from.
  var uri = {
    spec: "http://fakehost/test/page.html",
    host: "fakehost",
    prePath: "http://fakehost",
    scheme: "http",
    pathBase: "http://fakehost/test/"
  };
  var readability, result, readerable;
  try {
    readability = new Readability(uri, doc);
    readerable = readability.isProbablyReaderable();
    result = readability.parse();
  } catch (ex) {
    console.error(ex);
    // `stack` is a string, not an array, and is missing on non-Error values,
    // so print it as a whole instead of iterating over it.
    if (ex && ex.stack) {
      console.log(ex.stack);
    }
  }
  if (!result) {
    console.error("No content generated by readability, not going to write expected.html!");
    return;
  }

  // Remembers whether any write failed so the exit status can reflect it.
  var failed = false;

  fs.writeFile(destPath, prettyPrint(result.content), function(err) {
    if (err) {
      failed = true;
      console.error("Couldn't write data to expected.html!");
      console.error(err);
    }

    // Delete the result data we don't care about checking.
    delete result.uri;
    delete result.content;
    delete result.length;

    // Add isProbablyReaderable result
    result.readerable = readerable;

    fs.writeFile(metadataDestPath, JSON.stringify(result, null, 2) + "\n", function(err) {
      if (err) {
        failed = true;
        console.error("Couldn't write data to expected-metadata.json!");
        console.error(err);
      }

      process.exit(failed ? 1 : 0);
    });
  });
}
|
|
|
|
|