You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

115 lines
3.1 KiB

import fs from 'fs'
import URL from 'url'
import inquirer from 'inquirer'
import ora from 'ora'
import Mercury from '../dist/mercury'
import {
} from 'utils/dom'
import extractorTemplate from './templates/custom-extractor'
import extractorTestTemplate from './templates/custom-extractor-test'
/**
 * Prompt definition handed to `inquirer.prompt`: a single free-text question
 * asking for the URL of an article to create or extend a parser for.
 */
const questions = [
  {
    type: 'input',
    name: 'website',
    message: 'Paste a url to an article you\'d like to create or extend a parser for:',
    /**
     * Accept the answer only when a hostname can be extracted from it.
     *
     * @param {string} value - Raw text entered at the prompt.
     * @returns {boolean} `true` when `URL.parse(value)` yields a hostname,
     *   otherwise `false`.
     */
    validate(value) {
      const { hostname } = URL.parse(value);
      if (hostname) return true;
      return false;
    },
  },
];
// Ask the question(s) in `questions`, then continue with the collected answers.
// NOTE(review): the body of this `.then` callback and its closing `});` are
// not present in this copy of the file. Presumably it passes `answers.website`
// to scaffoldCustomParser — confirm against the upstream script.
inquirer.prompt(questions).then((answers) => {
// Holds the current `ora` spinner instance; assigned inside confirm().
let spinner;
/**
 * Call `fn` with `args` after creating a spinner labelled with `msg`.
 *
 * When the call returns a thenable, its resolved value is forwarded to
 * savePage together with `args` and `newParser`.
 *
 * NOTE(review): closing braces are missing in this copy, so it is unclear
 * whether `return result` belongs to the `else` branch or follows the
 * `if`/`else`. The spinner is created but never started or stopped in the
 * visible lines. Confirm both against the upstream script.
 *
 * @param {Function} fn - Function to invoke.
 * @param {Array} args - Arguments applied to `fn`.
 * @param {string} msg - Text given to the spinner.
 * @param {boolean} [newParser] - Forwarded unchanged to savePage.
 */
function confirm(fn, args, msg, newParser) {
spinner = ora({ text: msg });
// `fn` is invoked with `this` set to null.
const result = fn.apply(null, args);
// Promise-like results are chained into savePage once they resolve.
if (result && result.then) {
result.then(r => savePage(r, args, newParser));
} else {
return result;
/**
 * Save a fetched page as an HTML fixture and run Mercury.parse over it.
 *
 * The fixture path is `./fixtures/<hostname>/<timestamp>.html`. Before the
 * file is written, the root node is passed to makeLinksAbsolute,
 * protocol-relative `src` values are prefixed with `http:`, and the root node
 * is passed to stripJunkTags with `['script']`. When `newParser` is truthy,
 * generateScaffold is run through confirm() and a getting-started message is
 * logged.
 *
 * NOTE(review): this copy of the function is missing lines: the closing
 * braces, and the `console.log(` plus opening backtick that should start the
 * message in the `else` branch. makeLinksAbsolute and stripJunkTags are not
 * named by any visible import (the `import { } from 'utils/dom'` list is
 * empty). Restore these from the upstream script before running.
 *
 * @param {Function} $ - Selector function for the fetched page; presumably a
 *   cheerio-style object — confirm against Mercury.fetchResource.
 * @param {Array} args - Argument list whose first element is the page URL.
 * @param {boolean} [newParser] - Whether a new custom parser is being created.
 */
function savePage($, [url], newParser) {
const { hostname } = URL.parse(url);
// Millisecond timestamp, used as the fixture's file name.
const filename = new Date().getTime();
const file = `./fixtures/${hostname}/${filename}.html`;
// fix http(s) relative links:
makeLinksAbsolute($('*').first(), $, url)
// NOTE(review): the selector also matches [href], but only `src` is read and
// rewritten below, so protocol-relative `href` values are left untouched.
$('[src], [href]').each((index, node) => {
const $node = $(node)
const link = $node.attr('src')
if (link && link.slice(0, 2) === '//') {
$node.attr('src', `http:${link}`)
// Serialize the page after handing 'script' to stripJunkTags.
const html = stripJunkTags($('*').first(), $, ['script']).html();
fs.writeFileSync(file, html);
// The callback parameter `result` shadows the outer `const result`.
const result = Mercury.parse(url, html).then((result) => {
if (newParser) {
confirm(generateScaffold, [url, file, result], 'Generating parser and tests');
console.log(`Your custom site extractor has been set up. To get started building it, run
npm run watch:test -- ${hostname}`)
} else {
It looks like you already have a custom parser for this url.
The page you linked to has been added to ${file}. Copy and paste
the following code to use that page in your tests:
const html = fs.readFileSync('${file}');`)
/**
 * Write the extractor and extractor-test scaffolds for the site at `url`.
 *
 * Both files are rendered from the imported templates and written into the
 * directory returned by getDir(url) as `index.js` and `index.test.js`.
 *
 * @param {string} url - Article URL the parser is being created for.
 * @param {string} file - Path of the saved HTML fixture, passed to the test template.
 * @param {*} result - Parse result for the page, passed to the test template.
 * @returns {void}
 */
function generateScaffold(url, file, result) {
  const { hostname } = URL.parse(url);
  // Resolve the target directory once; it is used by the template and both writes.
  const dir = getDir(url);

  const extractor = extractorTemplate(hostname);
  const extractorTest = extractorTestTemplate(file, url, dir, result);

  fs.writeFileSync(`${dir}/index.js`, extractor);
  fs.writeFileSync(`${dir}/index.test.js`, extractorTest);
}
/**
 * Create `dir` through confirm() unless it already exists.
 *
 * @param {string} dir - Directory path to create.
 * @param {string} msg - Spinner text passed on to confirm().
 * @returns {void}
 */
function confirmCreateDir(dir, msg) {
  // Nothing to do when the directory is already present.
  if (fs.existsSync(dir)) return;

  confirm(fs.mkdirSync, [dir], msg);
}
/**
 * Prepare directories for the site at `url` and fetch the page as a fixture.
 *
 * `newParser` starts as false and is set to true when the extractor directory
 * returned by getDir(url) does not exist. It is forwarded through confirm()
 * along with the Mercury.fetchResource call.
 *
 * NOTE(review): closing braces are missing in this copy, so where the `if`
 * block ends cannot be determined from these lines. That leaves open whether
 * the fixtures directory creation and the fetch are conditional. Confirm
 * against the upstream script.
 *
 * @param {string} url - Article URL to scaffold a parser for.
 */
function scaffoldCustomParser(url) {
const dir = getDir(url);
const { hostname } = URL.parse(url);
let newParser = false
// A missing extractor directory means no custom parser exists for this host yet.
if (!fs.existsSync(dir)) {
newParser = true
confirmCreateDir(dir, `Creating ${hostname} directory`);
confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');
// The fetched resource is passed on to savePage by confirm() when it resolves.
confirm(Mercury.fetchResource, [url], 'Fetching fixture', newParser);
/**
 * Resolve the directory that holds the custom extractor for a URL's host.
 *
 * @param {string} url - URL of an article on the target site.
 * @returns {string} `./src/extractors/custom/<hostname>`, where `<hostname>`
 *   is the hostname reported by `URL.parse(url)`.
 */
function getDir(url) {
  const { hostname } = URL.parse(url);
  return `./src/extractors/custom/${hostname}`;
}