mercury-parser/scripts/generate-custom-parser.js

98 lines
2.6 KiB
JavaScript
Raw Normal View History

feat: generator for custom parsers and some documentation Squashed commit of the following: commit deaf9e60d031d9ee06e74b8c0895495b187032a5 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 20 10:31:09 2016 -0400 chore: README for custom parsers commit a8e8ad633e0d1576a52dbc90ce31b98fb2ec21ee Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 23:36:09 2016 -0400 draft of readme commit 4f0f463f821465c282ce006378e5d55f8f41df5f Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 17:56:34 2016 -0400 custom extractor used to build basic parser for theatlantic commit c5562a3cede41f56c4e723dcfa1181b49dcaae4d Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 17:20:13 2016 -0400 pre-commit to test custom parser generator commit 7d50d5b7ab780b79fae38afcb87a7d1da5d139b2 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 17:19:55 2016 -0400 feat: added nytimes parser commit 58b8d83a56927177984ddfdf70830bc4f328f200 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 17:17:28 2016 -0400 feat: can do fuzzy search or go straight to file commit c99add753723a8e2ac64d51d7379ac8e23125526 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 10:52:26 2016 -0400 refactored export for custom extractors for easier renames commit 22563413669651bb497f1bb2a92085b71f2ae324 Author: Adam Pash <adam.pash@gmail.com> Date: Fri Sep 16 17:36:13 2016 -0400 feat: custom extractor generation in place commit 2285a29908a7f82a5de3c81f6b2b902ddec9bdaa Author: Adam Pash <adam.pash@gmail.com> Date: Fri Sep 16 16:42:20 2016 -0400 good progress
2016-09-20 14:35:23 +00:00
import fs from 'fs'
import URL from 'url'
import inquirer from 'inquirer'
import ora from 'ora'
import Mercury from '../dist/mercury'
import extractorTemplate from './templates/custom-extractor'
import extractorTestTemplate from './templates/custom-extractor-test'
/**
 * Prompt definitions handed to inquirer. A single free-text question asks for
 * the url of an article; the answer is stored under the `website` key.
 */
const questions = [
  {
    type: 'input',
    name: 'website',
    message: 'Paste a url to an article you\'d like to create or extend a parser for:',
    // Accept the input only when a hostname can be parsed out of it.
    validate: value => Boolean(URL.parse(value).hostname),
  },
];
// Entry point: ask for an article url, then scaffold (or extend) the custom
// parser for that url's hostname.
inquirer
  .prompt(questions)
  .then((answers) => {
    scaffoldCustomParser(answers.website);
  })
  .catch((err) => {
    // Anything thrown synchronously while scaffolding (for example a directory
    // that cannot be created) would otherwise surface as an unhandled promise
    // rejection while a still-running spinner keeps the process alive.
    if (spinner) spinner.stop();
    console.error(err);
    process.exitCode = 1;
  });

// Spinner for the step currently in progress. It is assigned by confirm() and
// read by savePage(); the handlers above only run after this declaration has
// been evaluated.
let spinner;
/**
 * Runs one step of the generator behind a terminal spinner.
 *
 * The spinner is stored in the module-level `spinner` variable so that
 * savePage() can mark it as succeeded once an asynchronous step resolves.
 *
 * @param {Function} fn - The step to run; invoked with `args`.
 * @param {Array} args - Arguments passed to `fn` (and forwarded to savePage()
 *   when `fn` returns a thenable).
 * @param {string} msg - Text shown next to the spinner.
 * @param {boolean} [newParser] - Forwarded to savePage() for thenable results.
 * @returns {*} Whatever `fn` returned (the original thenable for async steps).
 * @throws Re-throws any error thrown synchronously by `fn`.
 */
function confirm(fn, args, msg, newParser) {
  // Keep a local handle: savePage() may start further steps that replace the
  // shared `spinner` before a failure of this step is reported.
  const stepSpinner = ora({ text: msg });
  spinner = stepSpinner;
  stepSpinner.start();

  let result;
  try {
    result = fn.apply(null, args);
  } catch (err) {
    // Mark the step as failed instead of leaving the spinner animating.
    stepSpinner.fail();
    throw err;
  }

  if (result && result.then) {
    result
      .then(
        r => savePage(r, args, newParser),
        (err) => {
          // The asynchronous step itself was rejected.
          stepSpinner.fail();
          throw err;
        }
      )
      .catch((err) => {
        // Report failures of the step or of savePage() rather than leaving
        // an unhandled rejection behind.
        console.error(err);
        process.exitCode = 1;
      });
  } else {
    stepSpinner.succeed();
  }

  return result;
}
/**
 * Stores a fetched page as an html fixture and tells the user what to do next.
 *
 * Marks the current spinner as succeeded, writes `$.html()` to a
 * timestamp-named file under `./fixtures/<hostname>/`, then either generates
 * the parser scaffold (new parser) or prints how to use the new fixture.
 *
 * @param {Object} $ - Fetched document; must expose an `html()` method.
 * @param {Array} args - Argument list of the fetch step; its first entry is
 *   the article url.
 * @param {boolean} [newParser] - Truthy when the scaffold must be generated.
 */
function savePage($, [url], newParser) {
  const { hostname } = URL.parse(url);
  spinner.succeed();

  const timestamp = Date.now();
  const file = `./fixtures/${hostname}/${timestamp}.html`;
  fs.writeFileSync(file, $.html());

  if (!newParser) {
    console.log(`It looks like you already have a custom parser for this url.
The page you linked to has been added to ${file}. Copy and paste
the following code to use that page in your tests:
const html = fs.readFileSync('${file}');`);
    return;
  }

  confirm(generateScaffold, [url, file], 'Generating parser and tests');
  console.log(`Your custom site extractor has been set up. To get started building it, run
npm test -- ${getDir(url)}/index.test.js`);
}
/**
 * Writes the extractor and its test file into the custom extractor directory
 * for the url's hostname.
 *
 * @param {string} url - Article url the parser is generated for.
 * @param {string} file - Path of the saved html fixture, passed to the test
 *   template.
 */
function generateScaffold(url, file) {
  const extractorDir = getDir(url);
  const { hostname } = URL.parse(url);

  const extractorSource = extractorTemplate(hostname);
  const extractorTestSource = extractorTestTemplate(file, url, extractorDir);

  fs.writeFileSync(`${extractorDir}/index.js`, extractorSource);
  fs.writeFileSync(`${extractorDir}/index.test.js`, extractorTestSource);
}
/**
 * Creates `dir` behind a spinner unless it already exists.
 *
 * @param {string} dir - Directory path to create.
 * @param {string} msg - Spinner text shown while the directory is created.
 */
function confirmCreateDir(dir, msg) {
  const alreadyThere = fs.existsSync(dir);
  if (alreadyThere) return;

  confirm(fs.mkdirSync, [dir], msg);
}
/**
 * Prepares the directories for a custom parser and starts fetching the
 * article that will be saved as its fixture.
 *
 * A parser counts as new when its extractor directory does not exist yet; that
 * flag is forwarded so the scaffold is generated after the page is saved.
 *
 * @param {string} url - Article url entered by the user.
 */
function scaffoldCustomParser(url) {
  const dir = getDir(url);
  const { hostname } = URL.parse(url);

  // Decide this before any directory is created below.
  const newParser = !fs.existsSync(dir);
  if (newParser) {
    confirmCreateDir(dir, `Creating ${hostname} directory`);
  }

  // Always make sure the fixtures directory exists: an existing parser may not
  // have one yet, and the fetched page is written into it once the fetch
  // resolves. confirmCreateDir() is a no-op when the directory is present.
  confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');

  confirm(Mercury.fetchResource, [url], 'Fetching fixture', newParser);
}
/**
 * Builds the path of the custom extractor directory for a url.
 *
 * @param {string} url - Article url.
 * @returns {string} `./src/extractors/custom/<hostname>` for the url's hostname.
 */
function getDir(url) {
  const parsedUrl = URL.parse(url);
  return `./src/extractors/custom/${parsedUrl.hostname}`;
}