You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/scripts/generate-custom-parser.js

142 lines
3.8 KiB
JavaScript

/* eslint-disable import/no-extraneous-dependencies */
/* eslint-disable no-use-before-define */
/* eslint-disable no-console */
import fs from 'fs';
import URL from 'url';
import inquirer from 'inquirer';
import ora from 'ora';
import { exec } from 'child_process';
import {
stripJunkTags,
makeLinksAbsolute,
} from 'utils/dom';
import Mercury from '../dist/mercury';
import extractorTemplate from './templates/custom-extractor';
import extractorTestTemplate from './templates/custom-extractor-test';
// Prompt definitions handed to inquirer. There is a single question: the
// article URL, which is accepted only when a hostname can be parsed from it.
const questions = [
  {
    type: 'input',
    name: 'website',
    message: 'Paste a url to an article you\'d like to create or extend a parser for:',
    // true when the input parses to a non-empty hostname, false otherwise.
    validate: value => Boolean(URL.parse(value).hostname),
  },
];
// The most recently started ora spinner. It is module-level because savePage
// stops the spinner that confirm() started for an asynchronous step.
let spinner;

/**
 * Run one step of the generator behind a progress spinner.
 *
 * Synchronous steps: the spinner is marked successful as soon as `fn` returns.
 * Promise-returning steps: the resolved value is forwarded to
 * `savePage(value, args, newParser)`, which is responsible for stopping the
 * spinner.
 *
 * @param {Function} fn - Step to run; invoked as `fn(...args)`.
 * @param {Array} args - Arguments for `fn` (also forwarded to savePage).
 * @param {string} msg - Text shown next to the spinner.
 * @param {boolean} [newParser] - Forwarded to savePage for async steps.
 * @returns {*} Whatever `fn` returned (the original promise for async steps).
 * @throws Re-throws any error thrown synchronously by `fn`.
 */
function confirm(fn, args, msg, newParser) {
  spinner = ora({ text: msg });
  spinner.start();

  let result;
  try {
    result = fn(...args);
  } catch (err) {
    // Stop the animation before the error propagates; otherwise the spinner
    // keeps rendering over the stack trace.
    spinner.fail();
    throw err;
  }

  if (result && result.then) {
    result
      .then(r => savePage(r, args, newParser))
      .catch((err) => {
        // A rejected step (or a throw inside savePage) previously surfaced
        // only as an unhandled rejection with the spinner still running.
        spinner.fail();
        console.error(err);
        process.exitCode = 1;
      });
  } else {
    spinner.succeed();
  }

  return result;
}
/**
 * Create `dir` behind a spinner, unless it already exists.
 *
 * @param {string} dir - Directory path to create.
 * @param {string} msg - Spinner text shown while creating it.
 */
function confirmCreateDir(dir, msg) {
  // Nothing to do (and no spinner shown) when the directory is already there.
  if (fs.existsSync(dir)) return;

  confirm(fs.mkdirSync, [dir], msg);
}
/**
 * Path of the custom extractor directory for the site a URL belongs to.
 *
 * @param {string} url - Article URL.
 * @returns {string} `./src/extractors/custom/<hostname>`.
 */
function getDir(url) {
  const parsedUrl = URL.parse(url);
  const site = parsedUrl.hostname;
  return `./src/extractors/custom/${site}`;
}
/**
 * Prepare the directories for a site's custom parser and start fetching the
 * article that will be stored as a fixture.
 *
 * A parser counts as "new" when its extractor directory does not exist yet;
 * that flag is passed along with the fetch so the follow-up step knows
 * whether to generate the extractor scaffold.
 *
 * @param {string} url - Article URL entered by the user.
 */
function scaffoldCustomParser(url) {
  const dir = getDir(url);
  const { hostname } = URL.parse(url);

  // Must be decided before the directory is created below.
  const newParser = !fs.existsSync(dir);
  if (newParser) {
    confirmCreateDir(dir, `Creating ${hostname} directory`);
  }

  // Ensure the fixtures directory even for an existing parser: the fetched
  // page is written there, and the write fails if the directory is missing.
  // confirmCreateDir does nothing when the directory already exists.
  confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');

  confirm(Mercury.fetchResource, [url], 'Fetching fixture', newParser);
}
// Script entry point: ask for the article URL, then scaffold a parser for it.
inquirer
  .prompt(questions)
  .then(({ website }) => {
    scaffoldCustomParser(website);
  });
/**
 * Write the starter extractor and its test for a site, register the extractor
 * in the custom extractors index, and kick off a lint-fix run on the new files.
 *
 * @param {string} url - Article URL the parser is being created for.
 * @param {string} file - Path of the saved fixture HTML file.
 * @param {*} result - Parse result passed through to the test template.
 */
function generateScaffold(url, file, result) {
  const { hostname } = URL.parse(url);
  const targetDir = getDir(url);
  const name = extractorName(hostname);

  const extractorSource = extractorTemplate(hostname, name);
  const testSource = extractorTestTemplate(file, url, targetDir, result, name);

  fs.writeFileSync(`${targetDir}/index.js`, extractorSource);
  fs.writeFileSync(`${targetDir}/index.test.js`, testSource);

  // Register the new extractor alongside the existing custom ones.
  fs.appendFileSync('./src/extractors/custom/index.js', exportString(url));

  // Fire-and-forget: the lint fix runs in a child process and is not awaited.
  exec(`npm run lint-fix-quiet -- ${targetDir}/*.js`);
}
/**
 * Save a fetched page as a timestamped HTML fixture and then either scaffold
 * a new custom parser or tell the user how to use the fixture in tests.
 *
 * Called by confirm() once the fetch promise resolves; it stops the spinner
 * that confirm() started.
 *
 * @param {Function} $ - Resolved value of the fetch step; used here as a
 *   selector function over the page (presumably cheerio — confirm).
 * @param {Array} args - The argument list of the fetch step; only its first
 *   element, the article URL, is used.
 * @param {boolean} newParser - Whether the extractor scaffold should be
 *   generated for this site.
 */
function savePage($, [url], newParser) {
  const { hostname } = URL.parse(url);
  spinner.succeed();

  // A millisecond timestamp keeps fixture names unique per run.
  const filename = new Date().getTime();
  const file = `./fixtures/${hostname}/${filename}.html`;

  // fix http(s) relative links:
  makeLinksAbsolute($('*').first(), $, url);
  $('[src], [href]').each((index, node) => {
    const $node = $(node);
    // Check both attributes the selector matches; previously only `src` was
    // read, so protocol-relative `href` values were never rewritten here.
    ['src', 'href'].forEach((attr) => {
      const link = $node.attr(attr);
      if (link && link.slice(0, 2) === '//') {
        $node.attr(attr, `http:${link}`);
      }
    });
  });

  // Drop <script> tags before writing the fixture to disk.
  const html = stripJunkTags($('*').first(), $, ['script']).html();
  fs.writeFileSync(file, html);

  Mercury.parse(url, html)
    .then((result) => {
      if (newParser) {
        confirm(generateScaffold, [url, file, result], 'Generating parser and tests');
        console.log(`Your custom site extractor has been set up. To get started building it, run
yarn watch:test -- ${hostname}
-- OR --
npm run watch:test -- ${hostname}`);
      } else {
        console.log(`
It looks like you already have a custom parser for this url.
The page you linked to has been added to ${file}. Copy and paste
the following code to use that page in your tests:
const html = fs.readFileSync('${file}');`);
      }
    })
    .catch((err) => {
      // Report parse/scaffold failures instead of leaving an unhandled
      // rejection behind.
      console.error(err);
      process.exitCode = 1;
    });
}
/**
 * Build the re-export statement that registers a site's extractor in the
 * custom extractors index.
 *
 * @param {string} url - Article URL for the site.
 * @returns {string} `export * from './<hostname>';`
 */
function exportString(url) {
  const site = URL.parse(url).hostname;
  return `export * from './${site}';`;
}
/**
 * Derive a PascalCase extractor name from a hostname, e.g.
 * `www.example.com` -> `WwwExampleComExtractor`.
 *
 * Both dots and hyphens are treated as word separators. Splitting on dots
 * alone left hyphens in the result (`My-siteComExtractor`), which cannot be
 * used as a JavaScript identifier. Hostnames without hyphens produce the same
 * name as before.
 * NOTE(review): a hostname starting with a digit still yields a name that
 * starts with a digit; confirm how the templates use the name.
 *
 * @param {string} hostname - Hostname of the site.
 * @returns {string} The capitalized hostname parts joined together, followed
 *   by `Extractor`.
 */
function extractorName(hostname) {
  const name = hostname
    .split(/[.-]/)
    .map(w => `${w.charAt(0).toUpperCase()}${w.slice(1)}`)
    .join('');
  return `${name}Extractor`;
}