feat: generator for custom parsers and some documentation
Squashed commit of the following: commit deaf9e60d031d9ee06e74b8c0895495b187032a5 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 20 10:31:09 2016 -0400 chore: README for custom parsers commit a8e8ad633e0d1576a52dbc90ce31b98fb2ec21ee Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 23:36:09 2016 -0400 draft of readme commit 4f0f463f821465c282ce006378e5d55f8f41df5f Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 17:56:34 2016 -0400 custom extractor used to build basic parser for theatlantic commit c5562a3cede41f56c4e723dcfa1181b49dcaae4d Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 17:20:13 2016 -0400 pre-commit to test custom parser generator commit 7d50d5b7ab780b79fae38afcb87a7d1da5d139b2 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 17:19:55 2016 -0400 feat: added nytimes parser commit 58b8d83a56927177984ddfdf70830bc4f328f200 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 17:17:28 2016 -0400 feat: can do fuzzy search or go straight to file commit c99add753723a8e2ac64d51d7379ac8e23125526 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 19 10:52:26 2016 -0400 refactored export for custom extractors for easier renames commit 22563413669651bb497f1bb2a92085b71f2ae324 Author: Adam Pash <adam.pash@gmail.com> Date: Fri Sep 16 17:36:13 2016 -0400 feat: custom extractor generation in place commit 2285a29908a7f82a5de3c81f6b2b902ddec9bdaa Author: Adam Pash <adam.pash@gmail.com> Date: Fri Sep 16 16:42:20 2016 -0400 good progresspull/5/head
parent
c4f06c7ebc
commit
8f42e119e8
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,97 @@
|
||||
import fs from 'fs'
|
||||
import URL from 'url'
|
||||
import inquirer from 'inquirer'
|
||||
import ora from 'ora'
|
||||
|
||||
import Mercury from '../dist/mercury'
|
||||
import extractorTemplate from './templates/custom-extractor'
|
||||
import extractorTestTemplate from './templates/custom-extractor-test'
|
||||
|
||||
// Inquirer prompt definitions: one question asking for an example
// article URL. The answer is only accepted when it parses to a hostname.
const questions = [
  {
    type: 'input',
    name: 'website',
    message: 'Paste a url to an article you\'d like to create or extend a parser for:',
    validate(value) {
      // url.parse yields hostname: null for anything non-URL-like.
      const { hostname } = URL.parse(value);
      return Boolean(hostname);
    },
  },
];
|
||||
|
||||
// Entry point of the interactive flow: ask for the example URL, then
// scaffold (or extend) the matching custom parser.
inquirer.prompt(questions).then(({ website }) => scaffoldCustomParser(website));
|
||||
|
||||
// Shared spinner handle so savePage (invoked from a promise callback)
// can resolve the spinner started here.
let spinner;

// Runs fn(...args) behind an ora spinner labelled with msg. If the call
// returns a promise-like value, its resolution is forwarded to savePage
// (along with args and the newParser flag); otherwise the spinner
// succeeds immediately. The raw result is returned either way.
function confirm(fn, args, msg, newParser) {
  spinner = ora({ text: msg });
  spinner.start();

  const result = fn(...args);

  if (result && result.then) {
    result.then(fetched => savePage(fetched, args, newParser));
  } else {
    spinner.succeed();
  }

  return result;
}
|
||||
|
||||
// Persists the fetched page ($, a cheerio handle) as an HTML fixture
// for the article's hostname, then either generates a fresh parser
// scaffold (newParser) or tells the user how to use the new fixture.
function savePage($, [url], newParser) {
  const { hostname } = URL.parse(url);

  spinner.succeed();

  // Timestamped name keeps multiple fixtures for one site distinct.
  // BUG FIX: the path previously interpolated the literal "$(unknown)"
  // instead of ${filename}, so `filename` was computed but never used.
  const filename = new Date().getTime();
  const file = `./fixtures/${hostname}/${filename}.html`;

  fs.writeFileSync(file, $.html());

  if (newParser) {
    confirm(generateScaffold, [url, file], 'Generating parser and tests');
    console.log(`Your custom site extractor has been set up. To get started building it, run
  npm test -- ${getDir(url)}/index.test.js`);
  } else {
    console.log(`It looks like you already have a custom parser for this url.
The page you linked to has been added to ${file}. Copy and paste
the following code to use that page in your tests:

const html = fs.readFileSync('${file}');`);
  }
}
|
||||
|
||||
// Writes the starter extractor implementation and its companion test
// into the parser directory derived from the article URL.
function generateScaffold(url, file) {
  const { hostname } = URL.parse(url);
  const dir = getDir(url);

  fs.writeFileSync(`${dir}/index.js`, extractorTemplate(hostname));
  fs.writeFileSync(`${dir}/index.test.js`, extractorTestTemplate(file, url, dir));
}
|
||||
|
||||
// Creates dir behind a spinner labelled msg, unless it already exists.
function confirmCreateDir(dir, msg) {
  if (fs.existsSync(dir)) return;

  confirm(fs.mkdirSync, [dir], msg);
}
|
||||
|
||||
// Orchestrates parser scaffolding for a URL: sets up the extractor and
// fixture directories when the site is new, then fetches the article
// so it can be saved as a test fixture.
function scaffoldCustomParser(url) {
  const dir = getDir(url);
  const { hostname } = URL.parse(url);
  const newParser = !fs.existsSync(dir);

  if (newParser) {
    confirmCreateDir(dir, `Creating ${hostname} directory`);
    confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory');
  }

  confirm(Mercury.fetchResource, [url], 'Fetching fixture', newParser);
}
|
||||
|
||||
// Maps an article URL to its custom-extractor directory, keyed by hostname.
function getDir(url) {
  return `./src/extractors/custom/${URL.parse(url).hostname}`;
}
|
@ -0,0 +1,14 @@
|
||||
import babel from 'rollup-plugin-babel';
|
||||
import babelrc from 'babelrc-rollup';
|
||||
import commonjs from 'rollup-plugin-commonjs';
|
||||
|
||||
// Rollup configuration: bundles the interactive parser generator into a
// single runnable CommonJS script at dist/generate-custom-parser.js.
export default {
  entry: './scripts/generate-custom-parser.js',
  plugins: [
    // Convert CommonJS dependencies to ES modules, then transpile with
    // the project's .babelrc settings.
    commonjs(),
    babel(babelrc()),
  ],
  format: 'cjs',
  dest: 'dist/generate-custom-parser.js', // equivalent to --output
  sourceMap: true,
};
|
@ -0,0 +1,48 @@
|
||||
import template from './index';
|
||||
|
||||
// Renders the starter test file for a newly scaffolded custom parser.
// `file` is the saved HTML fixture path, `url` the example article URL,
// and `dir` the parser directory (referenced in instructional comments).
//
// BUG FIX: the second `it` callback was emitted as `(async) () =>`,
// which is a syntax error in the generated file (it contains `await`);
// it must be `async () =>`.
export default function (file, url, dir) {
  return template`
    import assert from 'assert';
    import fs from 'fs';
    import URL from 'url';
    import cheerio from 'cheerio';

    import Mercury from 'mercury';
    import getExtractor from 'extractors/get-extractor';

    // Rename CustomExtractor
    describe('CustomExtractor', () => {
      it('is selected properly', () => {
        // To pass this test, rename your extractor in
        // ${dir}/index.js
        // then add your new extractor to
        // src/extractors/all.js
        const url = '${url}';
        const extractor = getExtractor(url);
        assert.equal(extractor.domain, URL.parse(url).hostname)
      })

      it('works with a starter story', async () => {
        // To pass this test, begin filling out your
        // selectors in ${dir}/index.js. This test is just
        // a stub; you can add more fields to test as much of
        // your parser as possible.
        const html = fs.readFileSync('${file}');
        const uri = '${url}';

        const { content, title, author } = await Mercury.parse(uri, html);
        const $ = cheerio.load(content);
        const text = $('*').first()
                      .text()
                      .trim()
                      .slice(0, 20);

        // Update these values with the expected values from
        // the article.
        assert.equal(title, '');
        assert.equal(author, '');
        assert.equal(text, '');
      });
    });
  `;
}
|
@ -0,0 +1,44 @@
|
||||
import template from './index';
|
||||
|
||||
// Renders the starter extractor implementation for `hostname`. The
// generated module exports a CustomExtractor object the developer
// renames and fills in with site-specific selectors.
//
// FIX: `transforms` is now scaffolded as an object ({selector: transform}),
// matching the documented shape and the existing custom extractors
// (e.g. NYTimesExtractor); it was previously emitted as an array.
export default function (hostname) {
  return template`
    // Rename CustomExtractor
    // to fit your publication
    export const CustomExtractor = {
      domain: '${hostname}',
      title: {
        selectors: [
          // enter title selectors
        ],
      },

      content: {
        selectors: [
          // enter content selectors
        ],

        // Is there anything in the content you selected that needs transformed
        // before it's consumable content? E.g., unusual lazy loaded images
        transforms: {
        },

        // Is there anything that is in the result that shouldn't be?
        // The clean selectors will remove anything that matches from
        // the result
        clean: [

        ]
      },

      date_published: null,

      lead_image_url: null,

      dek: null,

      next_page_url: null,

      excerpt: null,
    }
  `;
}
|
@ -0,0 +1,28 @@
|
||||
import insertValues from './insert-values'
|
||||
|
||||
// Matches a template literal body that starts with a newline and ends
// with the closing backtick's two-space indent.
const bodyPattern = /^\n([\s\S]+)\s{2}$/gm;
const trailingWhitespace = /\s+$/;

// Tag function that turns an indented template literal into clean
// source text: interpolates the values, drops the leading blank line,
// de-indents every remaining line, and trims trailing whitespace.
export default function template(strings, ...values) {
  const compiled = insertValues(strings, ...values);
  const [body] = compiled.match(bodyPattern) || [];

  // A matched body carries one extra indent level (up to 4 spaces);
  // otherwise fall back to the raw text and a 2-space indent.
  const source = body || compiled;
  const indentLevel = body ? /^\s{0,4}(.+)$/g : /^\s{0,2}(.+)$/g;

  return source
    .split('\n')
    .slice(1)
    .map(line => line
      .replace(indentLevel, '$1')
      .replace(trailingWhitespace, ''))
    .join('\n');
}
|
@ -0,0 +1,17 @@
|
||||
/**
 * Interpolates tagged-template values into their surrounding string
 * segments, like the default template-literal behavior.
 *
 * FIX: the previous truthiness check silently dropped falsy values
 * (0, false, ''); now only null/undefined render as the empty string.
 *
 * @param {string[]} strings literal segments from the tagged template
 * @param {...*} values interpolated values
 * @returns {string} the assembled string
 */
export default function insertValues(strings, ...values) {
  if (values.length) {
    return strings.reduce((result, part, idx) => {
      const value = values[idx];

      // Render every present value; only null/undefined become ''.
      return result + part + (value == null ? '' : String(value));
    }, '');
  }

  return strings.join('');
}
|
@ -1,13 +1,17 @@
|
||||
import NYMagExtractor from './custom/nymag.com';
|
||||
import BloggerExtractor from './custom/blogspot.com';
|
||||
import WikipediaExtractor from './custom/wikipedia.org';
|
||||
import TwitterExtractor from './custom/twitter.com';
|
||||
import { NYMagExtractor } from './custom/nymag.com';
|
||||
import { BloggerExtractor } from './custom/blogspot.com';
|
||||
import { WikipediaExtractor } from './custom/wikipedia.org';
|
||||
import { TwitterExtractor } from './custom/twitter.com';
|
||||
import { NYTimesExtractor } from './custom/www.nytimes.com';
|
||||
import { TheAtlanticExtractor } from './custom/www.theatlantic.com';
|
||||
|
||||
// Registry of custom extractors, keyed by hostname. The key must
// exactly match URL.parse(url).hostname for the target site (note the
// explicit `www.` on the nytimes/theatlantic entries).
const Extractors = {
  'nymag.com': NYMagExtractor,
  'blogspot.com': BloggerExtractor,
  'wikipedia.org': WikipediaExtractor,
  'twitter.com': TwitterExtractor,
  'www.nytimes.com': NYTimesExtractor,
  'www.theatlantic.com': TheAtlanticExtractor,
};

export default Extractors;
|
||||
|
@ -0,0 +1,137 @@
|
||||
# Custom Parsers
|
||||
|
||||
Mercury can extract meaningful content from almost any web site, but custom parsers allow the Mercury parser to find the content more quickly and more accurately than it might otherwise do. Our goal is to include custom parsers for as many sites as we can, and we'd love your help!
|
||||
|
||||
## How to generate a custom parser
|
||||
|
||||
Take a look at the live custom parsers in [`src/extractors/custom`](/src/extractors/custom) for examples and to check if the site you want to write a parser for already exists.
|
||||
|
||||
To generate a new custom parser, run:
|
||||
|
||||
```bash
|
||||
npm run generate-custom-parser
|
||||
```
|
||||
|
||||
This script will prompt you to paste a link to an article you want to parse. The URL you choose will serve as the example your parser tests against. The script will also generate your custom parser and a barebones (and failing) test for your parser.
|
||||
|
||||
At that point, you'll be prompted to run:
|
||||
|
||||
```bash
|
||||
npm test
|
||||
```
|
||||
|
||||
This will run the test for your parser, which will fail (which makes sense — you haven't written it yet!). Your goal now is to follow the instructions in the generated `<example.com>/index.test.js` and `<example.com>/index.js` files until they pass!
|
||||
|
||||
## How to write a custom parser
|
||||
|
||||
Custom parsers allow you to write CSS selectors that will find the content you're looking for on the page you're testing against. If you're familiar with jQuery, the selectors work exactly the same way.
|
||||
|
||||
You can query for every field returned by the Mercury Parser:
|
||||
|
||||
- title
|
||||
- author
|
||||
- content
|
||||
- date_published
|
||||
- lead_image_url
|
||||
- dek
|
||||
- next_page_url
|
||||
- excerpt
|
||||
|
||||
### Using selectors
|
||||
|
||||
To demonstrate, let's start with something simple: Your selector for the page's title might look something like this:
|
||||
|
||||
```javascript
|
||||
export const ExampleExtractor = {
|
||||
...
|
||||
|
||||
// Order by most likely. Extractor will stop on first occurrence
|
||||
title: {
|
||||
selectors: [
|
||||
'h1.hed',
|
||||
],
|
||||
},
|
||||
|
||||
...
|
||||
```
|
||||
|
||||
As you might guess, the selectors key provides an array of selectors that Mercury will check to find your title text. In our ExampleExtractor, we're saying that the title can be found in the text of an `h1` header with a class name of `hed`.
|
||||
|
||||
The selector you choose should return one element. If more than one element is returned by your selector, it will fail (and Mercury will fall back to its generic extractor).
|
||||
|
||||
This is all you'll need to know to handle most of the fields Mercury parses (titles, authors, date published, etc.). Article content is the exception.
|
||||
|
||||
### Cleaning content
|
||||
|
||||
An article's content can be more complex than the other fields, meaning you sometimes need to do more than just provide the selector(s) in order to return clean content.
|
||||
|
||||
For example, sometimes an article's content will contain related content that doesn't translate or render well when you just want to see the article's content. The clean key allows you to provide an array of selectors identifying elements that should be removed from the content.
|
||||
|
||||
Here's an example:
|
||||
|
||||
```javascript
|
||||
export const ExampleExtractor = {
|
||||
...
|
||||
|
||||
content: {
|
||||
selectors: [
|
||||
'div.g-blocks',
|
||||
'article#story',
|
||||
],
|
||||
|
||||
// Selectors to remove from the extracted content
|
||||
clean: [
|
||||
'.related',
|
||||
'.hidden',
|
||||
],
|
||||
}
|
||||
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
### Using transforms
|
||||
|
||||
Occasionally, in order to mold the article content to a form that's readable outside the page, you need to transform a few elements inside the content you've chosen. That's where `transforms` come in.
|
||||
|
||||
This example demonstrates a simple transform that converts h1 headers to h2 headers, along with a more complex transform that transforms lazy-loaded images to images that will render as you would expect outside the context of the site you're extracting from.
|
||||
|
||||
```javascript
|
||||
export const ExampleExtractor = {
|
||||
...
|
||||
|
||||
content: {
|
||||
selectors: [
|
||||
'div.article-content',
|
||||
],
|
||||
|
||||
transforms: {
|
||||
// In a simple transform, each key is the selector,
|
||||
// and the value, provided it's a string, represents
|
||||
// the tag that the matched item should be transformed to.
|
||||
|
||||
// Convert h1s to h2s
|
||||
h1: 'h2',
|
||||
|
||||
// If a function is given as the value, it should return a string
|
||||
// to convert to or nothing (in which case it will not perform
|
||||
// the transformation).
|
||||
|
||||
// Convert lazy-loaded noscript images to figures
|
||||
noscript: ($node) => {
|
||||
const $children = $node.children();
|
||||
if ($children.length === 1 && $children.get(0).tagName === 'img') {
|
||||
return 'figure';
|
||||
}
|
||||
|
||||
return null;
|
||||
},
|
||||
},
|
||||
},
|
||||
```
|
||||
|
||||
For much more complex transforms, you can perform dom manipulation within the transform function, but this is discouraged unless absolutely necessary. See, for example, the lazy-loaded image transform in [the NYTimesExtractor](www.nytimes.com/index.js#L25), which transforms the src attribute on the lazy-loaded image.
|
||||
|
||||
## Submitting a custom extractor
|
||||
|
||||
If you've written a custom extractor, please send us a pull request! Passing tests that demonstrate your parser in action will help us evaluate the parser. If you need more guidance for your custom parser or your tests, peruse any of the [custom extractors](./) and their accompanying tests.
|
@ -0,0 +1,64 @@
|
||||
// Custom extractor for www.nytimes.com articles.
export const NYTimesExtractor = {
  // FIX: `domain` was missing. Every other custom extractor declares it,
  // and the extractor-selection test asserts
  // extractor.domain === URL.parse(url).hostname.
  domain: 'www.nytimes.com',

  title: {
    selectors: [
      '.g-headline',
      'h1.headline',
    ],
  },

  author: {
    selectors: [
      '.g-byline',
      '.byline',
    ],
  },

  content: {
    selectors: [
      'div.g-blocks',
      'article#story',
    ],

    // Skip Mercury's generic content cleaning; the `clean` list below
    // handles this site explicitly.
    defaultCleaner: false,

    transforms: {
      // Lazy-loaded images carry a '{{size}}' placeholder in `src`;
      // substitute a fixed width so they render outside the page.
      'img.g-lazy': ($node) => {
        let src = $node.attr('src');
        const width = 640;

        src = src.replace('{{size}}', width);
        $node.attr('src', src);
      },
    },

    clean: [
      '.ad',
      'header#story-header',
      '.story-body-1 .lede.video',
      '.visually-hidden',
      '#newsletter-promo',
      '.promo',
      '.comments-button',
      '.hidden',
    ],
  },

  date_published: null,

  lead_image_url: null,

  dek: null,

  next_page_url: null,

  excerpt: null,
};
|
@ -0,0 +1,44 @@
|
||||
// Custom extractor for www.theatlantic.com articles.
export const TheAtlanticExtractor = {
  domain: 'www.theatlantic.com',
  title: {
    selectors: [
      'h1.hed',
    ],
  },

  author: {
    selectors: [
      'article#article .article-cover-extra .metadata .byline a',
    ],
  },

  content: {
    selectors: [
      '.article-body',
    ],

    // No element transforms needed for this site yet.
    transforms: [
    ],

    // Selectors matched here are stripped from the extracted content.
    clean: [
    ],
  },

  date_published: null,

  lead_image_url: null,

  dek: null,

  next_page_url: null,

  excerpt: null,
};
|
@ -0,0 +1,40 @@
|
||||
import assert from 'assert';
|
||||
import fs from 'fs';
|
||||
import URL from 'url';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import Mercury from 'mercury';
|
||||
import getExtractor from 'extractors/get-extractor';
|
||||
|
||||
// Tests for TheAtlanticExtractor (the describe block still carries the
// scaffold's placeholder name).
describe('CustomExtractor', () => {
  it('is selected properly', () => {
    // To pass this test, rename your extractor in
    // ./src/extractors/custom/www.theatlantic.com/index.js
    // then add your new extractor to
    // src/extractors/all.js
    const url = 'http://www.theatlantic.com/technology/archive/2016/09/why-new-yorkers-got-a-push-alert-about-a-manhunt/500591/';
    const extractor = getExtractor(url);
    assert.equal(extractor.domain, URL.parse(url).hostname);
  });

  // FIX: this callback was written as `(async) () =>`, a syntax error
  // that prevented the whole file from parsing; it must be `async () =>`
  // because the body awaits Mercury.parse.
  it('works with a starter story', async () => {
    // To pass this test, begin filling out your
    // selectors in ./src/extractors/custom/www.theatlantic.com/index.js. This test is just
    // a stub; you can add more fields to test as much of
    // your parser as possible.
    const html = fs.readFileSync('./fixtures/www.theatlantic.com/1474321707642.html');
    const uri = 'http://www.theatlantic.com/technology/archive/2016/09/why-new-yorkers-got-a-push-alert-about-a-manhunt/500591/';

    const { content, title, author } = await Mercury.parse(uri, html);
    const $ = cheerio.load(content);
    const text = $('*').first()
      .text()
      .trim()
      .slice(0, 20);

    assert.equal(title, 'Why New Yorkers Received a Push Alert About a Manhunt');
    assert.equal(author, 'Kaveh Waddell');
    assert.equal(text, 'Updated on September');
  });
});
Loading…
Reference in New Issue