Merge pull request #12 from postlight/feat-broadwayworld-extractor

feat: Add custom parser for broadwayworld.com
pull/13/head
Adam Pash 8 years ago committed by GitHub
commit 4ebfd37408

55
dist/mercury.js vendored

@ -983,6 +983,57 @@ var DeadspinExtractor = {
}
};
// Custom extractor configuration for articles on www.broadwayworld.com.
// Every field carries the list of selectors used to locate that value.
var BroadwayWorldExtractor = {
  domain: 'www.broadwayworld.com',

  title: {
    selectors: [
      'h1.article-title'
    ]
  },

  author: {
    selectors: [
      'span[itemprop=author]'
    ]
  },

  content: {
    selectors: [
      'div[itemprop=articlebody]'
    ],

    // Hook for content that needs to be transformed before it is
    // consumable (e.g., unusual lazy loaded images). Nothing is
    // registered for this site.
    transforms: {},

    // Selectors whose matches are removed from the result. Nothing is
    // stripped for this site.
    clean: []
  },

  date_published: {
    selectors: [
      ['meta[itemprop=datePublished]', 'value']
    ]
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value']
    ]
  },

  // No site-specific selectors are configured for the next two fields.
  next_page_url: {
    selectors: []
  },

  excerpt: {
    selectors: []
  }
};
var Extractors = {
'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor,
@ -998,8 +1049,8 @@ var Extractors = {
'fandom.wikia.com': WikiaExtractor,
'www.littlethings.com': LittleThingsExtractor,
'www.politico.com': PoliticoExtractor,
'deadspin.com': DeadspinExtractor
'deadspin.com': DeadspinExtractor,
'www.broadwayworld.com': BroadwayWorldExtractor
};
// Spacer images to be removed

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -13,7 +13,7 @@ import { WikiaExtractor } from './custom/fandom.wikia.com';
import { LittleThingsExtractor } from './custom/www.littlethings.com';
import { PoliticoExtractor } from './custom/www.politico.com';
import { DeadspinExtractor } from './custom/deadspin.com';
import { BroadwayWorldExtractor } from './custom/www.broadwayworld.com';
const Extractors = {
'nymag.com': NYMagExtractor,
@ -31,7 +31,7 @@ const Extractors = {
'www.littlethings.com': LittleThingsExtractor,
'www.politico.com': PoliticoExtractor,
'deadspin.com': DeadspinExtractor,
'www.broadwayworld.com': BroadwayWorldExtractor,
};
export default Extractors;

@ -0,0 +1,65 @@
// Custom extractor configuration for articles on www.broadwayworld.com.
// Every field carries the list of selectors used to locate that value.
export const BroadwayWorldExtractor = {
  domain: 'www.broadwayworld.com',

  title: { selectors: ['h1.article-title'] },

  author: { selectors: ['span[itemprop=author]'] },

  content: {
    selectors: ['div[itemprop=articlebody]'],

    // Hook for content that needs to be transformed before it is
    // consumable (e.g., unusual lazy loaded images). Nothing is
    // registered for this site.
    transforms: {},

    // Selectors whose matches are removed from the result. Nothing is
    // stripped for this site.
    clean: [],
  },

  date_published: { selectors: [['meta[itemprop=datePublished]', 'value']] },

  lead_image_url: { selectors: [['meta[name="og:image"]', 'value']] },

  dek: { selectors: [['meta[name="og:description"]', 'value']] },

  // No site-specific selectors are configured for the next two fields.
  next_page_url: { selectors: [] },

  excerpt: { selectors: [] },
};

@ -0,0 +1,130 @@
import assert from 'assert';
import fs from 'fs';
import URL from 'url';
import cheerio from 'cheerio';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
import { excerptContent } from 'utils/text';
// Tests for the www.broadwayworld.com custom extractor
// (./src/extractors/custom/www.broadwayworld.com/index.js), run against a
// saved copy of one article.
describe('BroadwayWorldExtractor', () => {
  // URL of the article the fixture was captured from.
  const url =
    'http://www.broadwayworld.com/article/American-Theatre-Wing-Launches-Andrew-Lloyd-Webber-Training-Scholarships-20161013';
  const fixturePath = './fixtures/www.broadwayworld.com/1476392567143.html';

  // Reads the fixture and hands it to Mercury.parse with `fallback: false`.
  // The read happens on every call, so a missing fixture fails the
  // individual test rather than the whole suite at load time.
  // NOTE(review): `fallback: false` presumably disables the generic
  // extractors so only the custom selectors are exercised — confirm.
  const parseFixture = () =>
    Mercury.parse(url, fs.readFileSync(fixturePath), { fallback: false });

  it('is selected properly', () => {
    // The extractor registered in src/extractors/all.js must be the one
    // picked for this hostname.
    const extractor = getExtractor(url);

    assert.equal(extractor.domain, URL.parse(url).hostname);
  });

  it('returns the title', async () => {
    const { title } = await parseFixture();

    assert.equal(title, 'American Theatre Wing Launches Andrew Lloyd Webber Training Scholarships');
  });

  it('returns the author', async () => {
    const { author } = await parseFixture();

    assert.equal(author, 'BWW News Desk');
  });

  it('returns the date_published', async () => {
    const { date_published } = await parseFixture();

    assert.equal(date_published, '2016-10-13T19:35:00.000Z');
  });

  it('returns the dek', async () => {
    const { dek } = await parseFixture();

    assert.equal(dek, 'The American Theatre Wing announced today that their Andrew Lloyd Webber Initiative has launched its second initiative program, the Training Scholarships, bridging the gap between talent and opportunity and creating a strong pipeline to the professional theatre for promising artists of all backgrounds.');
  });

  it('returns the lead_image_url', async () => {
    const { lead_image_url } = await parseFixture();

    assert.equal(lead_image_url, 'https://images.bwwstatic.com/columnpic7/7B5FD766-A644-E386-19DE07017A3AD79C.jpg');
  });

  it('returns the content', async () => {
    const { content } = await parseFixture();

    // Compare only the first 13 words of the extracted content.
    const $ = cheerio.load(content || '');
    const first13 = excerptContent($('*').first().text(), 13);

    assert.equal(first13, 'The American Theatre Wing announced today that their Andrew Lloyd Webber Initiative has');
  });
});
Loading…
Cancel
Save