// mercury-parser/scripts/generate-custom-parser.js

/* eslint-disable import/no-extraneous-dependencies */
/* eslint-disable no-use-before-define */
/* eslint-disable no-console */
import fs from 'fs';
import URL from 'url';
import inquirer from 'inquirer';
import ora from 'ora';
import { exec } from 'child_process';

import { stripJunkTags, makeLinksAbsolute } from 'utils/dom';
import Mercury from '../dist/mercury';
import extractorTemplate from './templates/custom-extractor';
import extractorTestTemplate from './templates/custom-extractor-test';

// Prompt definition handed to inquirer when no url is given on the command line.
const questions = [
  {
    type: 'input',
    name: 'website',
    message:
      "Paste a url to an article you'd like to create or extend a parser for:",
    // Accept the answer only when a hostname can be parsed out of it.
    validate(input) {
      const parsed = URL.parse(input);
      return Boolean(parsed.hostname);
    },
  },
];
// Handle to the ora spinner currently on screen; assigned in confirm() and
// marked as succeeded in confirm() or savePage().
let spinner;

/**
 * Runs `fn(...args)` behind an ora spinner labelled with `msg`.
 *
 * - Synchronous result: the spinner is marked as succeeded right away.
 * - Thenable result: the resolved value is handed to `savePage`, which marks
 *   the spinner as succeeded itself.
 * - Failure (sync throw or rejection of the task): the spinner is marked as
 *   failed instead of being left running.
 *
 * @param {Function} fn - Task to run.
 * @param {Array} args - Arguments spread into `fn`; also forwarded to `savePage`.
 * @param {string} msg - Spinner text.
 * @param {boolean} [newParser] - Forwarded untouched to `savePage`.
 * @returns {*} Whatever `fn` returned (the task's own promise for async tasks).
 */
function confirm(fn, args, msg, newParser) {
  spinner = ora({ text: msg });
  spinner.start();

  let result;
  try {
    result = fn(...args);
  } catch (err) {
    // Stop the spinner before the error propagates to the caller.
    spinner.fail();
    throw err;
  }

  if (result && typeof result.then === 'function') {
    result.then(
      r => savePage(r, args, newParser),
      err => {
        // The task itself rejected: end the spinner and report the error
        // rather than leaving an unhandled rejection behind.
        spinner.fail();
        console.error(err);
        process.exitCode = 1;
      }
    );
  } else {
    spinner.succeed();
  }

  return result;
}

/**
 * Creates `dir` (with spinner feedback via `confirm`) unless it already exists.
 *
 * @param {string} dir - Directory path to create.
 * @param {string} msg - Spinner text shown while creating the directory.
 */
function confirmCreateDir(dir, msg) {
  const alreadyExists = fs.existsSync(dir);
  if (alreadyExists) {
    return;
  }

  confirm(fs.mkdirSync, [dir], msg);
}

/**
 * Builds the path of the custom extractor directory for the site `url`
 * points at.
 *
 * @param {string} url - Article url.
 * @returns {string} `./src/extractors/custom/<hostname>`.
 */
function getDir(url) {
  const site = URL.parse(url).hostname;
  return `./src/extractors/custom/${site}`;
}

/**
 * Entry point of the scaffolding flow for one article url.
 *
 * Creates the extractor directory when the site has no custom parser yet,
 * makes sure the site's fixtures directory exists, and then fetches the page;
 * `confirm` passes the fetched page on to `savePage`.
 *
 * @param {string} url - Article url to build or extend a parser for.
 */
function scaffoldCustomParser(url) {
  const { hostname } = URL.parse(url);
  const dir = getDir(url);
  // A missing extractor directory means this site has no custom parser yet.
  const newParser = !fs.existsSync(dir);

  if (newParser) {
    confirmCreateDir(dir, `Creating ${hostname} directory`);
  }
  // Ensure the fixtures directory even for an existing parser: savePage
  // writes the fixture there and would fail if the directory were missing.
  // confirmCreateDir does nothing when the directory is already present.
  confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');

  confirm(Mercury.fetchResource, [url], 'Fetching fixture', newParser);
}

// When a command-line argument is supplied, treat it as the article url and
// skip the interactive prompt.
const [, , urlArg] = process.argv;
if (!urlArg) {
  inquirer.prompt(questions).then(({ website }) => {
    scaffoldCustomParser(website);
  });
} else {
  scaffoldCustomParser(urlArg);
}

/**
 * Writes the extractor and its test file for a new custom parser, registers
 * the extractor in the custom extractor index and kicks off a lint fix on the
 * generated files.
 *
 * @param {string} url - Article url the parser is generated for.
 * @param {string} file - Path of the html fixture saved for this url.
 * @param {*} result - Parse result passed through to the test template.
 */
function generateScaffold(url, file, result) {
  const site = URL.parse(url).hostname;
  const targetDir = getDir(url);
  const name = extractorName(site);

  const extractorSource = extractorTemplate(site, name);
  const testSource = extractorTestTemplate(file, url, targetDir, result, name);

  fs.writeFileSync(`${targetDir}/index.js`, extractorSource);
  fs.writeFileSync(`${targetDir}/index.test.js`, testSource);
  fs.appendFileSync('./src/extractors/custom/index.js', exportString(url));
  // Fire-and-forget: the lint run is not awaited and its outcome is not checked.
  exec(`npm run lint-fix-quiet -- ${targetDir}/*.js`);
}

/**
 * Persists the fetched page as an html fixture, then either scaffolds a new
 * custom parser or tells the user how to reuse the fixture with the existing
 * one.
 *
 * @param {Function} $ - DOM query function for the fetched page; used here
 *   like a cheerio/jQuery selector (presumably what the fetch resolves with —
 *   confirm against Mercury.fetchResource).
 * @param {string[]} args - Argument list the fetch was run with; only the
 *   first entry (the url) is used.
 * @param {boolean} newParser - True when no custom parser existed for the site.
 */
function savePage($, [url], newParser) {
  const { hostname } = URL.parse(url);

  // Ends the spinner that confirm() started for the fetch.
  spinner.succeed();

  // Fixture files are named after the current epoch-millisecond timestamp.
  const filename = new Date().getTime();
  const file = `./fixtures/${hostname}/${filename}.html`;
  // fix http(s) relative links:
  makeLinksAbsolute($('*').first(), $, url);
  // Give any remaining protocol-relative value (`//host/path`) an explicit
  // scheme. Both attributes matched by the selector are rewritten.
  $('[src], [href]').each((index, node) => {
    const $node = $(node);
    ['src', 'href'].forEach(attr => {
      const link = $node.attr(attr);
      if (link && link.slice(0, 2) === '//') {
        $node.attr(attr, `http:${link}`);
      }
    });
  });
  const html = stripJunkTags($('*').first(), $, ['script']).html();

  fs.writeFileSync(file, html);

  Mercury.parse(url, { html })
    .then(result => {
      if (newParser) {
        confirm(
          generateScaffold,
          [url, file, result],
          'Generating parser and tests'
        );
        console.log(`Your custom site extractor has been set up. To get started building it, run
      yarn watch:test -- ${hostname}
        -- OR --
      npm run watch:test -- ${hostname}`);
      } else {
        console.log(`
  It looks like you already have a custom parser for this url.
  The page you linked to has been added to ${file}. Copy and paste
  the following code to use that page in your tests:
  const html = fs.readFileSync('${file}');`);
      }
    })
    .catch(err => {
      // Report parse/scaffold failures instead of leaving the rejection
      // unhandled, and signal the failure through the exit code.
      console.error(err);
      process.exitCode = 1;
    });
}

/**
 * Builds the re-export statement that registers a site's extractor in the
 * custom extractor index.
 *
 * @param {string} url - Article url of the site.
 * @returns {string} `export * from './<hostname>';`
 */
function exportString(url) {
  const site = URL.parse(url).hostname;
  return `export * from './${site}';`;
}

/**
 * Derives the extractor's name from a hostname by capitalising each label and
 * appending `Extractor`, e.g. `www.example.com` -> `WwwExampleComExtractor`.
 *
 * Hyphens are treated as separators as well as dots, so `the-verge.com`
 * becomes `TheVergeComExtractor`; a hyphen left in the name would make it
 * unusable as a JavaScript identifier (presumably how the templates use it —
 * confirm against the extractor templates). A hostname that starts with a
 * digit still yields a name that starts with a digit.
 *
 * @param {string} hostname - Hostname of the site.
 * @returns {string} The generated extractor name.
 */
function extractorName(hostname) {
  const name = hostname
    .split(/[.-]/)
    .map(w => `${w.charAt(0).toUpperCase()}${w.slice(1)}`)
    .join('');
  return `${name}Extractor`;
}