feat: parser auto-generates name; lint is more specific

pull/17/head
Adam Pash 8 years ago
parent 65c641a879
commit 048d654417

@@ -6,6 +6,7 @@ var fs = _interopDefault(require('fs'));
var URL = _interopDefault(require('url'));
var inquirer = _interopDefault(require('inquirer'));
var ora = _interopDefault(require('ora'));
var child_process = require('child_process');
var regenerator = _interopDefault(require('babel-runtime/regenerator'));
var _extends = _interopDefault(require('babel-runtime/helpers/extends'));
var asyncToGenerator = _interopDefault(require('babel-runtime/helpers/asyncToGenerator'));
@@ -1354,14 +1355,14 @@ function template(strings) {
}).join('\n');
}
var _templateObject = taggedTemplateLiteral(['\n // Rename CustomExtractor\n // to fit your publication\n // (e.g., NYTimesExtractor)\n export const CustomExtractor = {\n domain: \'', '\',\n\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it\'s consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn\'t be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n }\n '], ['\n // Rename CustomExtractor\n // to fit your publication\n // (e.g., NYTimesExtractor)\n export const CustomExtractor = {\n domain: \'', '\',\n\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it\'s consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn\'t be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n }\n ']);
var _templateObject = taggedTemplateLiteral(['\n export const ', ' = {\n domain: \'', '\',\n\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it\'s consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn\'t be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n }\n '], ['\n export const ', ' = {\n domain: \'', '\',\n\n title: {\n selectors: [\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n // enter author selectors\n ],\n },\n\n date_published: {\n selectors: [\n // enter selectors\n ],\n },\n\n dek: {\n selectors: [\n // enter selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n // enter selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it\'s consumable content? E.g., unusual lazy loaded images\n transforms: {\n },\n\n // Is there anything that is in the result that shouldn\'t be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ]\n },\n }\n ']);
function extractorTemplate (hostname) {
return template(_templateObject, hostname);
function extractorTemplate (hostname, name) {
return template(_templateObject, name, hostname);
}
var _templateObject$1 = taggedTemplateLiteral(['\n it(\'returns the ', '\', async () => {\n // To pass this test, fill out the ', ' selector\n // in ', '/index.js.\n const html =\n fs.readFileSync(\'', '\');\n const articleUrl =\n \'', '\';\n\n const { ', ' } =\n await Mercury.parse(articleUrl, html, { fallback: false });\n\n // Update these values with the expected values from\n // the article.\n assert.equal(', ', ', ')\n });\n '], ['\n it(\'returns the ', '\', async () => {\n // To pass this test, fill out the ', ' selector\n // in ', '/index.js.\n const html =\n fs.readFileSync(\'', '\');\n const articleUrl =\n \'', '\';\n\n const { ', ' } =\n await Mercury.parse(articleUrl, html, { fallback: false });\n\n // Update these values with the expected values from\n // the article.\n assert.equal(', ', ', ')\n });\n ']);
var _templateObject2 = taggedTemplateLiteral(['\n import assert from \'assert\';\n import fs from \'fs\';\n import URL from \'url\';\n import cheerio from \'cheerio\';\n\n import Mercury from \'mercury\';\n import getExtractor from \'extractors/get-extractor\';\n import { excerptContent } from \'utils/text\';\n\n // Rename CustomExtractor\n // to fit your publication\n // (e.g., NYTimesExtractor)\n describe(\'CustomExtractor\', () => {\n it(\'is selected properly\', () => {\n // This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const url =\n \'', '\';\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n ', '\n\n it(\'returns the content\', async () => {\n // To pass this test, fill out the content selector\n // in ', '/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const html =\n fs.readFileSync(\'', '\');\n const url =\n \'', '\';\n\n const { content } =\n await Mercury.parse(url, html, { fallback: false });\n\n const $ = cheerio.load(content || \'\');\n\n const first13 = excerptContent($(\'*\').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, \'Add the first 13 words of the article here\');\n });\n });\n '], ['\n import assert from \'assert\';\n import fs from \'fs\';\n import URL from \'url\';\n import cheerio from \'cheerio\';\n\n import Mercury from \'mercury\';\n import getExtractor from \'extractors/get-extractor\';\n import { excerptContent } from \'utils/text\';\n\n // Rename CustomExtractor\n // to fit your publication\n // (e.g., NYTimesExtractor)\n describe(\'CustomExtractor\', () => {\n it(\'is selected properly\', () => {\n // This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const url =\n \'', '\';\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n ', '\n\n it(\'returns the content\', async () => {\n // To pass this test, fill out the content selector\n // in ', '/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const html =\n fs.readFileSync(\'', '\');\n const url =\n \'', '\';\n\n const { content } =\n await Mercury.parse(url, html, { fallback: false });\n\n const $ = cheerio.load(content || \'\');\n\n const first13 = excerptContent($(\'*\').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, \'Add the first 13 words of the article here\');\n });\n });\n ']);
var _templateObject2 = taggedTemplateLiteral(['\n import assert from \'assert\';\n import fs from \'fs\';\n import URL from \'url\';\n import cheerio from \'cheerio\';\n\n import Mercury from \'mercury\';\n import getExtractor from \'extractors/get-extractor\';\n import { excerptContent } from \'utils/text\';\n\n describe(\'', '\', () => {\n it(\'is selected properly\', () => {\n // This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const url =\n \'', '\';\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n ', '\n\n it(\'returns the content\', async () => {\n // To pass this test, fill out the content selector\n // in ', '/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const html =\n fs.readFileSync(\'', '\');\n const url =\n \'', '\';\n\n const { content } =\n await Mercury.parse(url, html, { fallback: false });\n\n const $ = cheerio.load(content || \'\');\n\n const first13 = excerptContent($(\'*\').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, \'Add the first 13 words of the article here\');\n });\n });\n '], ['\n import assert from \'assert\';\n import fs from \'fs\';\n import URL from \'url\';\n import cheerio from \'cheerio\';\n\n import Mercury from \'mercury\';\n import getExtractor from \'extractors/get-extractor\';\n import { excerptContent } from \'utils/text\';\n\n describe(\'', '\', () => {\n it(\'is selected properly\', () => {\n // This test should be passing by default.\n // It sanity checks that the correct parser\n // is being selected for URLs from this domain\n const url =\n \'', '\';\n const extractor = getExtractor(url);\n assert.equal(extractor.domain, URL.parse(url).hostname)\n })\n\n ', '\n\n it(\'returns the content\', async () => {\n // To pass this test, fill out the content selector\n // in ', '/index.js.\n // You may also want to make use of the clean and transform\n // options.\n const html =\n fs.readFileSync(\'', '\');\n const url =\n \'', '\';\n\n const { content } =\n await Mercury.parse(url, html, { fallback: false });\n\n const $ = cheerio.load(content || \'\');\n\n const first13 = excerptContent($(\'*\').first().text(), 13)\n\n // Update these values with the expected values from\n // the article.\n assert.equal(first13, \'Add the first 13 words of the article here\');\n });\n });\n ']);
var IGNORE = ['url', 'domain', 'content', 'word_count', 'next_page_url', 'excerpt', 'direction', 'total_pages', 'rendered_pages'];
function testFor(key, value, dir, file, url) {
@@ -1372,8 +1373,8 @@ function testFor(key, value, dir, file, url) {
return template(_templateObject$1, key, key, dir, file, url, key, key, value ? "`" + value + "`" : "''");
}
function extractorTestTemplate (file, url, dir, result) {
return template(_templateObject2, url, Reflect.ownKeys(result).map(function (k) {
function extractorTestTemplate (file, url, dir, result, name) {
return template(_templateObject2, name, url, Reflect.ownKeys(result).map(function (k) {
return testFor(k, result[k], dir, file, url);
}).join('\n\n'), dir, file, url);
}
@@ -1456,12 +1457,20 @@ function generateScaffold(url, file, result) {
var hostname = _URL$parse3.hostname;
var extractor = extractorTemplate(hostname);
var extractorTest = extractorTestTemplate(file, url, getDir(url), result);
var extractor = extractorTemplate(hostname, extractorName(hostname));
var extractorTest = extractorTestTemplate(file, url, getDir(url), result, extractorName(hostname));
fs.writeFileSync(getDir(url) + '/index.js', extractor);
fs.writeFileSync(getDir(url) + '/index.test.js', extractorTest);
fs.appendFileSync('./src/extractors/custom/index.js', exportString(url));
child_process.exec('npm run lint-fix-quiet -- ' + getDir(url) + '/*.js');
}
function extractorName(hostname) {
var name = hostname.split('.').map(function (w) {
return '' + w.charAt(0).toUpperCase() + w.slice(1);
}).join('');
return name + 'Extractor';
}
function exportString(url) {
@@ -1469,7 +1478,7 @@ function exportString(url) {
var hostname = _URL$parse4.hostname;
return 'export * from \'./' + hostname + '\'';
return 'export * from \'./' + hostname + '\';';
}
function confirmCreateDir(dir, msg) {

File diff suppressed because one or more lines are too long

@@ -5,12 +5,13 @@
"main": "./dist/mercury.js",
"scripts": {
"lint": "eslint src/** --fix",
"lint-fix-quiet": "eslint --fix --quiet",
"build": "eslint src/** --fix && rollup -c",
"build-generator": "rollup -c scripts/rollup.config.js",
"test_build": "rollup -c",
"test": "jest",
"watch:test": "jest --watch --noStackTrace",
"generate-parser": "node ./dist/generate-custom-parser.js && eslint src/extractors/custom/**/*.js --fix --quiet > /dev/null",
"generate-parser": "node ./dist/generate-custom-parser.js",
"add-contributor": "all-contributors add",
"generate-contributors": "all-contributors generate"
},

@@ -2,6 +2,7 @@ import fs from 'fs'
import URL from 'url'
import inquirer from 'inquirer'
import ora from 'ora'
import { exec } from 'child_process'
import Mercury from '../dist/mercury'
import {
@@ -81,8 +82,8 @@ function savePage($, [url], newParser) {
function generateScaffold(url, file, result) {
const { hostname } = URL.parse(url);
const extractor = extractorTemplate(hostname)
const extractorTest = extractorTestTemplate(file, url, getDir(url), result)
const extractor = extractorTemplate(hostname, extractorName(hostname))
const extractorTest = extractorTestTemplate(file, url, getDir(url), result, extractorName(hostname))
fs.writeFileSync(`${getDir(url)}/index.js`, extractor)
fs.writeFileSync(`${getDir(url)}/index.test.js`, extractorTest)
@@ -90,11 +91,20 @@ function generateScaffold(url, file, result) {
'./src/extractors/custom/index.js',
exportString(url),
)
exec(`npm run lint-fix-quiet -- ${getDir(url)}/*.js`)
}
function extractorName(hostname) {
const name = hostname
.split('.')
.map(w => `${w.charAt(0).toUpperCase()}${w.slice(1)}`)
.join('')
return `${name}Extractor`
}
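// e.g. (illustrative hostnames, not from this commit):
//   extractorName('www.newyorker.com') => 'WwwNewyorkerComExtractor'
//   extractorName('deadspin.com')      => 'DeadspinComExtractor'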
function exportString(url) {
const { hostname } = URL.parse(url);
return `export * from './${hostname}'`;
return `export * from './${hostname}';`;
}
function confirmCreateDir(dir, msg) {

@@ -34,7 +34,7 @@ function testFor(key, value, dir, file, url) {
`;
}
export default function (file, url, dir, result) {
export default function (file, url, dir, result, name) {
return template`
import assert from 'assert';
import fs from 'fs';
@@ -45,10 +45,7 @@ export default function (file, url, dir, result) {
import getExtractor from 'extractors/get-extractor';
import { excerptContent } from 'utils/text';
// Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
describe('CustomExtractor', () => {
describe('${name}', () => {
it('is selected properly', () => {
// This test should be passing by default.
// It sanity checks that the correct parser

@@ -1,11 +1,8 @@
import template from './index';
export default function (hostname) {
export default function (hostname, name) {
return template`
// Rename CustomExtractor
// to fit your publication
// (e.g., NYTimesExtractor)
export const CustomExtractor = {
export const ${name} = {
domain: '${hostname}',
title: {

@@ -188,84 +188,11 @@ npm run watch:test -- www.newyorker.com
This will run the tests for the parser you just generated, which should fail (which makes sense — you haven't written it yet!). Your goal now is to follow the instructions in the generated `www.newyorker.com/index.test.js` and `www.newyorker.com/index.js` files until they pass!
### Step 2: Passing your first test
### Step 2: Passing your first test: Title extraction
If you look at your parser's test file, you'll see a few instructions to guide you in making your tests pass. The first comment at the top of our test file is simple: It instructs you to rename CustomExtractor. In our case, we're going to rename it to NewYorkerExtractor.
If you look at your parser's test file, you'll see a few instructions to guide you in polishing your parser and making your tests pass.
So, from this:
```javascript
// Rename CustomExtractor
describe('CustomExtractor', () => {
...
```
...to this:
```javascript
describe('NewYorkerExtractor', () => {
...
```
That was easy, but when you save the file you may notice that this didn't make your test pass. So let's look more closely at the first test:
```javascript
it('is selected properly', () => {
// To pass this test, rename your extractor in
// ./src/extractors/custom/www.newyorker.com/index.js
// (e.g., CustomExtractor => NYTimesExtractor)
// then add your new extractor to
// src/extractors/all.js
const url =
'http://www.newyorker.com/tech/elements/hacking-cryptography-and-the-countdown-to-quantum-computing';
const extractor = getExtractor(url);
assert.equal(extractor.domain, URL.parse(url).hostname);
});
```
This test checks to see whether or not Mercury can find your custom parser. In our example, it ensures that Mercury correctly selects our custom parser when it's parsing a link from `http://www.newyorker.com`. At the very top of the test, you'll see the following instructions:
```javascript
// To pass this test, rename your extractor in
// ./src/extractors/custom/www.newyorker.com/index.js
// (e.g., CustomExtractor => NYTimesExtractor)
// then add your new extractor to
// src/extractors/all.js
```
So let's follow those instructions. In [`www.newyorker.com/index.js`](./www.newyorker.com/index.js), we're going to rename `CustomExtractor` to `NewYorkerExtractor`, just like we did in the test. The top of our custom parser should now look like this:
```javascript
export const NewYorkerExtractor = {
domain: 'www.newyorker.com',
...
}
```
Save the file, and you'll notice your test still isn't passing. If we refer back to the instructions above, we'll see that we need to add our new extractor to [`src/extractors/all.js`](../all.js). So let's do that. First, we need to add the following import to the rest of the imports at top of the file:
```javascript
import { NewYorkerExtractor } from './custom/www.newyorker.com';
```
Next, we need to add our new custom extractor to the Extractors object. The key should be your site's domain (in our case, `www.newyorker.com`), and the value should be the extractor we imported above. So, it should look like this:
```javascript
const Extractors = {
...
'www.newyorker.com': NewYorkerExtractor,
};
```
When you save your changes, you'll notice that your first test is now passing — congrats! Now it's time to move onto your next test.
### Step 3: Performing a simple title extraction
The next test checks to see whether your extractor returns the correct title:
By default, the first test, which ensures your custom extractor is selected properly, should already be passing. The first failing test checks whether your extractor returns the correct title:
```javascript
it('returns the title', async () => {
@@ -325,7 +252,7 @@ AssertionError: 'Hacking, Cryptography, and the Countdown to Quantum Computing'
When Mercury generated our test, it took a guess at the page's title, and in this case, it got it wrong. So update the test with the title we expect, save it, and your test should pass!
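With the guess corrected, the updated assertion might look something like this (the expected title here is taken from the assertion error above):

```javascript
// The expected value now matches the article's actual title.
assert.equal(
  title,
  'Hacking, Cryptography, and the Countdown to Quantum Computing'
);
```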
### Step 4: Speed it up
### Step 3: Speed it up
We've been moving at a slow pace, but as you can see, once you understand the basics, extracting most items on the page is actually very easy. For example, if you follow the same instructions to find the author selector, you'll find that the `.contributors` selector will return the correct author (Alex Hutchinson).
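As a rough sketch (assuming the `.contributors` selector above), the author field in your extractor would look something like this:

```javascript
// www.newyorker.com/index.js (sketch): the author field, using the
// .contributors selector mentioned above; other fields omitted.
export const NewYorkerExtractor = {
  domain: 'www.newyorker.com',

  author: {
    selectors: ['.contributors'],
  },
};
```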
@@ -351,7 +278,7 @@ As [explained above](#selecting-an-attribute), to return an attribute rather than
You can refer to the [NewYorkerExtractor](www.newyorker.com/index.js) to see the rest of the basic selectors.
### Step 5: Content extraction
### Step 4: Content extraction
I've left content extraction for last, since it's often the trickiest, sometimes requiring special passes to [clean](#cleaning-content) and [transform](#using-transforms) the content. For the New Yorker, the first part is easy: the selector for this page is clearly `div#articleBody`. But that's just the first step, because unlike the other tests, which match a simple string, here we need to sanity check that the page looks good when rendered and that our selector doesn't return any elements we don't want.
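As a starting point, here's a sketch of that content block, assuming the `div#articleBody` selector identified above; `transforms` and `clean` start empty until the rendered output shows something that needs them:

```javascript
// www.newyorker.com/index.js (sketch): content extraction using the
// selector identified above.
export const NewYorkerExtractor = {
  domain: 'www.newyorker.com',

  content: {
    selectors: ['div#articleBody'],

    // Fix up unusual markup (e.g., lazy-loaded images) here if needed
    transforms: {},

    // Selectors listed here are stripped from the returned content
    clean: [],
  },
};
```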
