feat: generator for custom parsers and some documentation

Squashed commit of the following:

commit deaf9e60d031d9ee06e74b8c0895495b187032a5
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 20 10:31:09 2016 -0400

    chore: README for custom parsers

commit a8e8ad633e0d1576a52dbc90ce31b98fb2ec21ee
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 19 23:36:09 2016 -0400

    draft of readme

commit 4f0f463f821465c282ce006378e5d55f8f41df5f
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 19 17:56:34 2016 -0400

    custom extractor used to build basic parser for theatlantic

commit c5562a3cede41f56c4e723dcfa1181b49dcaae4d
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 19 17:20:13 2016 -0400

    pre-commit to test custom parser generator

commit 7d50d5b7ab780b79fae38afcb87a7d1da5d139b2
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 19 17:19:55 2016 -0400

    feat: added nytimes parser

commit 58b8d83a56927177984ddfdf70830bc4f328f200
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 19 17:17:28 2016 -0400

    feat: can do fuzzy search or go straight to file

commit c99add753723a8e2ac64d51d7379ac8e23125526
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 19 10:52:26 2016 -0400

    refactored export for custom extractors for easier renames

commit 22563413669651bb497f1bb2a92085b71f2ae324
Author: Adam Pash <adam.pash@gmail.com>
Date:   Fri Sep 16 17:36:13 2016 -0400

    feat: custom extractor generation in place

commit 2285a29908a7f82a5de3c81f6b2b902ddec9bdaa
Author: Adam Pash <adam.pash@gmail.com>
Date:   Fri Sep 16 16:42:20 2016 -0400

    good progress

.gitignore

@@ -2,7 +2,6 @@ node_modules
build
npm-debug.log
TODO.md
-fixtures
read
preview.html
preview.json

@@ -43,7 +43,7 @@ My goal is be to create declarative extractors that describe what rather than ho
```javascript
NYMagExtractor = {
content: {
-    // Order by most likely. Extractor will stop on first occurence
+    // Order by most likely. Extractor will stop on first occurrence
selectors: [
'div.article-content',
'section.body',

@@ -34,3 +34,7 @@ The result looks like this:
```
If Mercury is unable to find a field, that field will return `null`.
+## Contributing
+If you'd like to write a custom parser for a site, [here's how](src/extractors/custom/README.md).


dist/mercury.js

@@ -501,11 +501,59 @@ var TwitterExtractor = {
};
var NYTimesExtractor = {
title: {
selectors: ['.g-headline', 'h1.headline']
},
author: {
selectors: ['.g-byline', '.byline']
},
content: {
selectors: ['div.g-blocks', 'article#story'],
defaultCleaner: false,
transforms: {
'img.g-lazy': function imgGLazy($node) {
var src = $node.attr('src');
// const widths = $node.attr('data-widths')
// .slice(1)
// .slice(0, -1)
// .split(',');
// if (widths.length) {
// width = widths.slice(-1);
// } else {
// width = '900';
// }
var width = 640;
src = src.replace('{{size}}', width);
$node.attr('src', src);
}
},
clean: ['.ad', 'header#story-header', '.story-body-1 .lede.video', '.visually-hidden', '#newsletter-promo', '.promo', '.comments-button', '.hidden']
},
date_published: null,
lead_image_url: null,
dek: null,
next_page_url: null,
excerpt: null
};
var Extractors = {
'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor,
'wikipedia.org': WikipediaExtractor,
-  'twitter.com': TwitterExtractor
+  'twitter.com': TwitterExtractor,
+  'www.nytimes.com': NYTimesExtractor
};
// Spacer images to be removed
@@ -3207,7 +3255,7 @@ var ATTR_RE = /\[([\w-]+)\]/;
function cleanBySelectors($content, $, _ref) {
var clean = _ref.clean;
-  if (!clean) return null;
+  if (!clean) return $content;
$(clean.join(','), $content).remove();
@@ -3218,7 +3266,7 @@ function cleanBySelectors($content, $, _ref) {
function transformElements($content, $, _ref2) {
var transforms = _ref2.transforms;
-  if (!transforms) return null;
+  if (!transforms) return $content;
_Reflect$ownKeys(transforms).forEach(function (key) {
var $matches = $(key, $content);
@@ -3553,6 +3601,32 @@ var Mercury = {
}
}, _callee, _this);
}))();
},
// A convenience method for getting a resource
// to work with, e.g., for custom extractor generator
fetchResource: function fetchResource(url) {
var _this2 = this;
return asyncToGenerator(regeneratorRuntime.mark(function _callee2() {
return regeneratorRuntime.wrap(function _callee2$(_context2) {
while (1) {
switch (_context2.prev = _context2.next) {
case 0:
_context2.next = 2;
return Resource.create(url);
case 2:
return _context2.abrupt('return', _context2.sent);
case 3:
case 'end':
return _context2.stop();
}
}
}, _callee2, _this2);
}))();
}
};


package.json
@@ -7,8 +7,10 @@
    "start": "node ./build",
    "lint": "eslint src/** --fix",
    "build": "eslint src/** --fix && rollup -c",
+    "build-generator": "rollup -c scripts/rollup.config.js",
    "test_build": "rollup -c",
-    "test": "./test-runner"
+    "test": "./test-runner",
+    "generate-custom-parser": "node ./dist/generate-custom-parser.js"
},
"author": "",
"license": "ISC",
@@ -33,10 +35,13 @@
    "eslint-plugin-import": "^1.15.0",
    "eslint-plugin-jsx-a11y": "^2.2.2",
    "eslint-plugin-react": "^6.2.1",
+    "inquirer": "^1.1.3",
    "mocha": "^3.0.2",
+    "ora": "^0.3.0",
    "rollup": "^0.34.13",
    "rollup-plugin-babel": "^2.6.1",
-    "rollup-plugin-commonjs": "^4.1.0"
+    "rollup-plugin-commonjs": "^4.1.0",
+    "rollup-plugin-multi-entry": "^2.0.1"
},
"dependencies": {
"babel-polyfill": "^6.13.0",

scripts/generate-custom-parser.js
@@ -0,0 +1,97 @@
import fs from 'fs'
import URL from 'url'
import inquirer from 'inquirer'
import ora from 'ora'
import Mercury from '../dist/mercury'
import extractorTemplate from './templates/custom-extractor'
import extractorTestTemplate from './templates/custom-extractor-test'
const questions = [
{
type: 'input',
name: 'website',
message: 'Paste a url to an article you\'d like to create or extend a parser for:',
validate(value) {
const { hostname } = URL.parse(value);
if (hostname) return true;
return false;
},
},
];
inquirer.prompt(questions).then((answers) => {
scaffoldCustomParser(answers.website);
});
let spinner;
function confirm(fn, args, msg, newParser) {
spinner = ora({ text: msg });
spinner.start();
const result = fn.apply(null, args);
if (result && result.then) {
result.then(r => savePage(r, args, newParser));
} else {
spinner.succeed();
}
return result;
}
function savePage($, [url], newParser) {
const { hostname } = URL.parse(url);
spinner.succeed();
const filename = new Date().getTime();
const file = `./fixtures/${hostname}/${filename}.html`;
fs.writeFileSync(file, $.html());
if (newParser) {
confirm(generateScaffold, [url, file], 'Generating parser and tests');
console.log(`Your custom site extractor has been set up. To get started building it, run
npm test -- ${getDir(url)}/index.test.js`)
} else {
console.log(`It looks like you already have a custom parser for this url.
The page you linked to has been added to ${file}. Copy and paste
the following code to use that page in your tests:
const html = fs.readFileSync('${file}');`)
}
}
function generateScaffold(url, file) {
const { hostname } = URL.parse(url);
const extractor = extractorTemplate(hostname)
const extractorTest = extractorTestTemplate(file, url, getDir(url))
fs.writeFileSync(`${getDir(url)}/index.js`, extractor)
fs.writeFileSync(`${getDir(url)}/index.test.js`, extractorTest)
}
function confirmCreateDir(dir, msg) {
if (!fs.existsSync(dir)) {
confirm(fs.mkdirSync, [dir], msg);
}
}
function scaffoldCustomParser(url) {
const dir = getDir(url);
const { hostname } = URL.parse(url);
let newParser = false
if (!fs.existsSync(dir)) {
newParser = true
confirmCreateDir(dir, `Creating ${hostname} directory`);
confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');
}
confirm(Mercury.fetchResource, [url], 'Fetching fixture', newParser);
}
function getDir(url) {
const { hostname } = URL.parse(url);
return `./src/extractors/custom/${hostname}`;
}

scripts/rollup.config.js
@@ -0,0 +1,14 @@
import babel from 'rollup-plugin-babel';
import babelrc from 'babelrc-rollup';
import commonjs from 'rollup-plugin-commonjs';
export default {
entry: './scripts/generate-custom-parser.js',
plugins: [
commonjs(),
babel(babelrc()),
],
format: 'cjs',
dest: 'dist/generate-custom-parser.js', // equivalent to --output
sourceMap: true,
};

scripts/templates/custom-extractor-test.js
@@ -0,0 +1,48 @@
import template from './index';
export default function (file, url, dir) {
return template`
import assert from 'assert';
import fs from 'fs';
import URL from 'url';
import cheerio from 'cheerio';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
// Rename CustomExtractor
describe('CustomExtractor', () => {
it('is selected properly', () => {
// To pass this test, rename your extractor in
// ${dir}/index.js
// then add your new extractor to
// src/extractors/all.js
const url = '${url}';
const extractor = getExtractor(url);
assert.equal(extractor.domain, URL.parse(url).hostname)
})
it('works with a starter story', async () => {
// To pass this test, begin filling out your
// selectors in ${dir}/index.js. This test is just
// a stub; you can add more fields to test as much of
// your parser as possible.
const html = fs.readFileSync('${file}');
const uri = '${url}';
const { content, title, author } = await Mercury.parse(uri, html);
const $ = cheerio.load(content);
const text = $('*').first()
.text()
.trim()
.slice(0, 20);
// Update these values with the expected values from
// the article.
assert.equal(title, '');
assert.equal(author, '');
assert.equal(text, '');
});
});
`;
}

scripts/templates/custom-extractor.js
@@ -0,0 +1,44 @@
import template from './index';
export default function (hostname) {
return template`
// Rename CustomExtractor
// to fit your publication
export const CustomExtractor = {
domain: '${hostname}',
title: {
selectors: [
// enter title selectors
],
},
content: {
selectors: [
// enter content selectors
],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
transforms: [
],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
]
},
date_published: null,
lead_image_url: null,
dek: null,
next_page_url: null,
excerpt: null,
}
`;
}

scripts/templates/index.js
@@ -0,0 +1,28 @@
import insertValues from './insert-values';

const bodyPattern = /^\n([\s\S]+)\s{2}$/gm;
const trailingWhitespace = /\s+$/;

// Tag function for the generator's template literals: interpolates the
// supplied values, then strips the leading newline, the template's base
// indentation, and any trailing whitespace so the generated file is flush left.
export default function template(strings, ...values) {
  const compiled = insertValues(strings, ...values);

  let [body] = compiled.match(bodyPattern) || [];
  let indentLevel = /^\s{0,4}(.+)$/g;

  if (!body) {
    body = compiled;
    indentLevel = /^\s{0,2}(.+)$/g;
  }

  return body.split('\n')
             .slice(1)
             .map((line) => {
               line = line.replace(indentLevel, '$1');

               if (trailingWhitespace.test(line)) {
                 line = line.replace(trailingWhitespace, '');
               }

               return line;
             })
             .join('\n');
}

scripts/templates/insert-values.js
@@ -0,0 +1,17 @@
// Interpolates tagged-template values between their surrounding strings,
// substituting an empty string for any value that is missing.
export default function insertValues(strings, ...values) {
  if (values.length) {
    return strings.reduce((result, part, idx) => {
      let value = values[idx];

      if (value && typeof value.toString === 'function') {
        value = value.toString();
      } else {
        value = '';
      }

      return result + part + value;
    }, '');
  }

  return strings.join('');
}

src/extractors/all.js
@@ -1,13 +1,17 @@
-import NYMagExtractor from './custom/nymag.com';
-import BloggerExtractor from './custom/blogspot.com';
-import WikipediaExtractor from './custom/wikipedia.org';
-import TwitterExtractor from './custom/twitter.com';
+import { NYMagExtractor } from './custom/nymag.com';
+import { BloggerExtractor } from './custom/blogspot.com';
+import { WikipediaExtractor } from './custom/wikipedia.org';
+import { TwitterExtractor } from './custom/twitter.com';
+import { NYTimesExtractor } from './custom/www.nytimes.com';
+import { TheAtlanticExtractor } from './custom/www.theatlantic.com';
const Extractors = {
'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor,
'wikipedia.org': WikipediaExtractor,
'twitter.com': TwitterExtractor,
+  'www.nytimes.com': NYTimesExtractor,
+  'www.theatlantic.com': TheAtlanticExtractor,
};
export default Extractors;

src/extractors/custom/README.md
@@ -0,0 +1,137 @@
# Custom Parsers
Mercury can extract meaningful content from almost any web site, but custom parsers allow the Mercury parser to find a site's content more quickly and accurately than it otherwise would. Our goal is to include custom parsers for as many sites as we can, and we'd love your help!
## How to generate a custom parser
Take a look at the live custom parsers in [`src/extractors/custom`](/src/extractors/custom) for examples and to check whether the site you want to write a parser for is already covered.
To generate a new custom parser, run:
```bash
npm run generate-custom-parser
```
This script will prompt you to paste a link to an article you want to parse. The URL you choose will serve as the example your parser tests against. The script will also generate your custom parser and a barebones (and failing) test for it.
At that point, you'll be prompted to run:
```bash
npm test
```
This will run the test for your parser, which will fail at first (naturally, since you haven't written it yet). Your goal now is to follow the instructions in the generated `<example.com>/index.test.js` and `<example.com>/index.js` files until the tests pass!
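Note that you don't have to run the entire suite every time: the test runner accepts a path, so you can run just your parser's test. The path below assumes a hypothetical `www.example.com` parser; substitute the directory the generator created for you.
```bash
npm test -- src/extractors/custom/www.example.com/index.test.js
```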
## How to write a custom parser
Custom parsers allow you to write CSS selectors that will find the content you're looking for on the page you're testing against. If you're familiar with jQuery, the selectors work exactly the same way.
You can query for every field returned by the Mercury Parser:
- title
- author
- content
- date_published
- lead_image_url
- dek
- next_page_url
- excerpt
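Each of these maps to a key on the result Mercury returns, and a field Mercury can't find comes back as `null`. Purely as an illustration (the values below are placeholders borrowed from the example tests in this commit), a parsed result might look something like:
```javascript
{
  title: 'Why New Yorkers Received a Push Alert About a Manhunt',
  author: 'Kaveh Waddell',
  content: '<div class="article-body">...</div>',
  date_published: null,
  lead_image_url: null,
  dek: null,
  next_page_url: null,
  excerpt: null,
}
```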
### Using selectors
To demonstrate, let's start with something simple: Your selector for the page's title might look something like this:
```javascript
export const ExampleExtractor = {
  ...

  // Order by most likely. Extractor will stop on first occurrence
  title: {
    selectors: [
      'h1.hed',
    ],
  },

  ...
```
As you might guess, the selectors key provides an array of selectors that Mercury will check to find your title text. In our ExampleExtractor, we're saying that the title can be found in the text of an `h1` header with a class name of `hed`.
The selector you choose should return one element. If more than one element is returned by your selector, it will fail (and Mercury will fall back to its generic extractor).
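Most of the other simple fields follow the same pattern. For instance, an author selector might look like the sketch below; the `.byline a` selector is only a placeholder, so check the markup of the page you're actually parsing.
```javascript
export const ExampleExtractor = {
  ...

  author: {
    selectors: [
      '.byline a',
    ],
  },

  ...
```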
This is all you'll need to know to handle most of the fields Mercury parses (titles, authors, date published, etc.). Article content is the exception.
### Cleaning content
An article's content can be more complex than the other fields, meaning you sometimes need to do more than just provide the selector(s) in order to return clean content.
For example, sometimes an article's markup includes related content that doesn't translate or render well when all you want is the article itself. The `clean` key allows you to provide an array of selectors identifying elements that should be removed from the content.
Here's an example:
```javascript
export const ExampleExtractor = {
  ...

  content: {
    selectors: [
      'div.g-blocks',
      'article#story',
    ],

    // Selectors to remove from the extracted content
    clean: [
      '.related',
      '.hidden',
    ],
  }

  ...
}
```
### Using transforms
Occasionally, in order to mold the article content to a form that's readable outside the page, you need to transform a few elements inside the content you've chosen. That's where `transforms` come in.
This example demonstrates a simple transform that converts `h1` headers to `h2` headers, along with a more complex transform that converts lazy-loaded images into images that will render as expected outside the context of the site you're extracting from.
```javascript
export const ExampleExtractor = {
  ...

  content: {
    selectors: [
      'div.article-content',
    ],

    transforms: {
      // In a simple transform, each key is the selector,
      // and the value, provided it's a string, represents
      // the tag that the matched item should be transformed to.

      // Convert h1s to h2s
      h1: 'h2',

      // If a function is given as the value, it should return a string
      // to convert to, or nothing (in which case it will not perform
      // the transformation).

      // Convert lazy-loaded noscript images to figures
      noscript: ($node) => {
        const $children = $node.children();
        if ($children.length === 1 && $children.get(0).tagName === 'img') {
          return 'figure';
        }
        return null;
      },
    },
  },
```
For much more complex transforms, you can perform DOM manipulation within the transform function, but this is discouraged unless absolutely necessary. See, for example, the lazy-loaded image transform in [the NYTimesExtractor](www.nytimes.com/index.js#L25), which rewrites the `src` attribute on the lazy-loaded image.
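Once your selectors and transforms are working, the extractor also has to be registered so Mercury knows to use it for your domain. Based on the changes to `src/extractors/all.js` in this commit, wiring in a new parser looks roughly like this (`ExampleExtractor` and `example.com` are placeholders for your extractor and its domain):
```javascript
// src/extractors/all.js (sketch)
import { ExampleExtractor } from './custom/example.com';

const Extractors = {
  // ...the existing extractors, e.g. 'nymag.com': NYMagExtractor, stay here
  'example.com': ExampleExtractor,
};

export default Extractors;
```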
## Submitting a custom extractor
If you've written a custom extractor, please send us a pull request! Passing tests that demonstrate your parser in action will help us evaluate the parser. If you need more guidance for your custom parser or your tests, peruse any of the [custom extractors](./) and their accompanying tests.

src/extractors/custom/blogspot.com/index.js
@@ -1,4 +1,4 @@
-const BloggerExtractor = {
+export const BloggerExtractor = {
domain: 'blogspot.com',
content: {
// Blogger is insane and does not load its content
@@ -36,5 +36,3 @@ const BloggerExtractor = {
],
},
};
-export default BloggerExtractor;

src/extractors/custom/nymag.com/index.js
@@ -1,7 +1,7 @@
-const NYMagExtractor = {
+export const NYMagExtractor = {
domain: 'nymag.com',
content: {
-    // Order by most likely. Extractor will stop on first occurence
+    // Order by most likely. Extractor will stop on first occurrence
selectors: [
'div.article-content',
'section.body',
@@ -64,5 +64,3 @@ const NYMagExtractor = {
],
},
};
-export default NYMagExtractor;

src/extractors/custom/twitter.com/index.js
@@ -1,4 +1,4 @@
-const TwitterExtractor = {
+export const TwitterExtractor = {
domain: 'twitter.com',
content: {
@@ -46,6 +46,3 @@ const TwitterExtractor = {
},
};
-export default TwitterExtractor;

src/extractors/custom/wikipedia.org/index.js
@@ -1,4 +1,4 @@
-const WikipediaExtractor = {
+export const WikipediaExtractor = {
domain: 'wikipedia.org',
content: {
selectors: [
@@ -45,5 +45,3 @@ const WikipediaExtractor = {
},
};
-export default WikipediaExtractor;

src/extractors/custom/www.nytimes.com/index.js
@@ -0,0 +1,64 @@
export const NYTimesExtractor = {
title: {
selectors: [
'.g-headline',
'h1.headline',
],
},
author: {
selectors: [
'.g-byline',
'.byline',
],
},
content: {
selectors: [
'div.g-blocks',
'article#story',
],
defaultCleaner: false,
transforms: {
'img.g-lazy': ($node) => {
let src = $node.attr('src');
// const widths = $node.attr('data-widths')
// .slice(1)
// .slice(0, -1)
// .split(',');
// if (widths.length) {
// width = widths.slice(-1);
// } else {
// width = '900';
// }
const width = 640;
src = src.replace('{{size}}', width);
$node.attr('src', src);
},
},
clean: [
'.ad',
'header#story-header',
'.story-body-1 .lede.video',
'.visually-hidden',
'#newsletter-promo',
'.promo',
'.comments-button',
'.hidden',
],
},
date_published: null,
lead_image_url: null,
dek: null,
next_page_url: null,
excerpt: null,
};

src/extractors/custom/www.nytimes.com/index.test.js
@@ -0,0 +1,39 @@
import assert from 'assert';
import fs from 'fs';
import cheerio from 'cheerio';
import Mercury from 'mercury';
describe('NYTimesExtractor', () => {
it('works with a feature story', async () => {
const html = fs.readFileSync('./fixtures/www.nytimes.com/1474061823854.html');
const uri = 'http://www.nytimes.com/interactive/2016/09/15/arts/design/national-museum-of-african-american-history-and-culture.html';
const { content, title, author } = await Mercury.parse(uri, html);
const $ = cheerio.load(content);
const text = $('*').first()
.text()
.trim()
.slice(0, 20);
assert.equal(title, 'I, Too, Sing America');
assert.equal(author, 'The New York Times');
assert.equal(text, 'T he Smithsonians N');
});
it('works with a regular news story', async () => {
const html = fs.readFileSync('./fixtures/www.nytimes.com/1474318141888.html');
const uri = 'http://www.nytimes.com/2016/09/20/nyregion/nyc-nj-explosions-ahmad-khan-rahami.html';
const { content, title, author } = await Mercury.parse(uri, html);
const $ = cheerio.load(content);
const text = $('*').first()
.text()
.trim()
.slice(0, 20);
assert.equal(title, 'Ahmad Khan Rahami Is Arrested in Manhattan and New Jersey Bombings');
assert.equal(author, 'MARC SANTORA, WILLIAM K. RASHBAUM, AL BAKER and ADAM GOLDMAN');
assert.equal(text, 'The man believed to ');
});
});

src/extractors/custom/www.theatlantic.com/index.js
@@ -0,0 +1,44 @@
// Rename CustomExtractor
// to fit your publication
export const TheAtlanticExtractor = {
domain: 'www.theatlantic.com',
title: {
selectors: [
'h1.hed',
],
},
author: {
selectors: [
'article#article .article-cover-extra .metadata .byline a',
]
},
content: {
selectors: [
'.article-body',
],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy-loaded images
transforms: [
],
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: [
]
},
date_published: null,
lead_image_url: null,
dek: null,
next_page_url: null,
excerpt: null,
}

src/extractors/custom/www.theatlantic.com/index.test.js
@@ -0,0 +1,40 @@
import assert from 'assert';
import fs from 'fs';
import URL from 'url';
import cheerio from 'cheerio';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
// Rename CustomExtractor
describe('CustomExtractor', () => {
it('is selected properly', () => {
// To pass this test, rename your extractor in
// ./src/extractors/custom/www.theatlantic.com/index.js
// then add your new extractor to
// src/extractors/all.js
const url = 'http://www.theatlantic.com/technology/archive/2016/09/why-new-yorkers-got-a-push-alert-about-a-manhunt/500591/';
const extractor = getExtractor(url);
assert.equal(extractor.domain, URL.parse(url).hostname)
})
it('works with a starter story', async () => {
// To pass this test, begin filling out your
// selectors in ./src/extractors/custom/www.theatlantic.com/index.js. This test is just
// a stub; you can add more fields to test as much of
// your parser as possible.
const html = fs.readFileSync('./fixtures/www.theatlantic.com/1474321707642.html');
const uri = 'http://www.theatlantic.com/technology/archive/2016/09/why-new-yorkers-got-a-push-alert-about-a-manhunt/500591/';
const { content, title, author } = await Mercury.parse(uri, html);
const $ = cheerio.load(content);
const text = $('*').first()
.text()
.trim()
.slice(0, 20);
assert.equal(title, 'Why New Yorkers Received a Push Alert About a Manhunt');
assert.equal(author, 'Kaveh Waddell');
assert.equal(text, 'Updated on September');
});
});

@@ -7,7 +7,7 @@ import { ATTR_RE } from './constants';
// Remove elements by an array of selectors
export function cleanBySelectors($content, $, { clean }) {
-  if (!clean) return null;
+  if (!clean) return $content;
$(clean.join(','), $content).remove();
@@ -16,7 +16,7 @@ export function cleanBySelectors($content, $, { clean }) {
// Transform matching elements
export function transformElements($content, $, { transforms }) {
-  if (!transforms) return null;
+  if (!transforms) return $content;
Reflect.ownKeys(transforms).forEach((key) => {
const $matches = $(key, $content);

@@ -63,6 +63,12 @@ const Mercury = {
return result;
},
+  // A convenience method for getting a resource
+  // to work with, e.g., for custom extractor generator
+  async fetchResource(url) {
+    return await Resource.create(url);
+  },
};
export default Mercury;

test-runner
@@ -2,12 +2,15 @@
# Runs the mocha tests
if [ $BASH_ARGV ]; then
-  FILES=$(find src -name "*$BASH_ARGV*.test.js")
-  echo Running test for $FILES...
+  if [ -e "$BASH_ARGV" ]; then
+    FILES=$BASH_ARGV
+  else
+    FILES=$(find src -name "*$BASH_ARGV*.test.js")
+  fi
+  echo Running test for $FILES
else
echo Running all tests...
FILES=$(find src -name "*.test.js")
fi
mocha --reporter spec --compilers js:babel-register $FILES --require babel-polyfill
