Merge remote-tracking branch 'origin/master'

pull/12/head
Toy Vano 8 years ago
commit 84619641b0

138
dist/mercury.js vendored

@ -713,6 +713,139 @@ var MSNExtractor = {
excerpt: null
};
// Custom extraction rules for articles served from www.yahoo.com.
// Every field carries the list of selectors used to locate that piece
// of the article. A [selector, attribute] pair presumably reads the named
// attribute from the matched node -- confirm against the extractor runtime.
var YahooExtractor = {
  domain: 'www.yahoo.com',

  title: { selectors: ['header.canvas-header'] },

  author: { selectors: ['span.provider-name'] },

  content: {
    selectors: ['.content-canvas'],

    // No transforms are needed before the content is consumable.
    transforms: [],

    // Anything matching these selectors is removed from the result.
    clean: ['.figure-caption']
  },

  date_published: { selectors: ['time.date'] },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value']
    ]
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value']
    ]
  },

  // Not configured for this site.
  next_page_url: null,
  excerpt: null
};
// Custom extraction rules for articles served from www.buzzfeed.com.
// Every field carries the list of selectors used to locate that piece
// of the article. A [selector, attribute] pair presumably reads the named
// attribute from the matched node -- confirm against the extractor runtime.
var BuzzfeedExtractor = {
  domain: 'www.buzzfeed.com',

  title: {
    selectors: ['h1[id="post-title"]']
  },

  author: {
    // The second entry is a class selector; without the leading dot it
    // would be read as an element named <byline__author> and never match.
    selectors: ['a[data-action="user/username"]', '.byline__author']
  },

  content: {
    // Kept in step with the module source for this extractor. The colon in
    // the attribute name `rel:gt_cat` is backslash-escaped because a bare
    // colon is not valid inside a CSS attribute name.
    selectors: ['#buzz_sub_buzz', '.bf_dom', 'div[rel\\:gt_cat="[ttp]:content"]'],

    // No transforms are needed before the content is consumable.
    transforms: [],

    // Anything matching these selectors is removed from the result;
    // nothing is removed for this site.
    clean: []
  },

  date_published: {
    selectors: ['.buzz-datetime']
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },

  dek: {
    selectors: [['meta[name="description"]', 'value']]
  },

  // Not configured for this site.
  next_page_url: null,
  excerpt: null
};
// Custom extraction rules for articles served from fandom.wikia.com.
// Every field carries the list of selectors used to locate that piece
// of the article. A [selector, attribute] pair presumably reads the named
// attribute from the matched node -- confirm against the extractor runtime.
var WikiaExtractor = {
  domain: 'fandom.wikia.com',

  title: {
    selectors: ['h1.entry-title']
  },

  author: {
    // `.author.vcard` targets one element carrying both classes. The former
    // `.author vcard` was a descendant selector looking for a <vcard>
    // element, which cannot match HTML markup.
    selectors: ['.author.vcard', '.fn']
  },

  content: {
    selectors: ['.grid-content', '.entry-content'],

    // No transforms are needed before the content is consumable.
    transforms: [],

    // Anything matching these selectors is removed from the result;
    // nothing is removed for this site.
    clean: []
  },

  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
  },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },

  dek: {
    selectors: [['meta[name="og:description"]', 'value']]
  },

  // Not configured for this site.
  next_page_url: null,
  excerpt: null
};
var Extractors = {
'nymag.com': NYMagExtractor,
'blogspot.com': BloggerExtractor,
@ -722,7 +855,10 @@ var Extractors = {
'www.theatlantic.com': TheAtlanticExtractor,
'www.newyorker.com': NewYorkerExtractor,
'www.wired.com': WiredExtractor,
'www.msn.com': MSNExtractor
'www.msn.com': MSNExtractor,
'www.yahoo.com': YahooExtractor,
'www.buzzfeed.com': BuzzfeedExtractor,
'fandom.wikia.com': WikiaExtractor
};

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -7,6 +7,9 @@ import { TheAtlanticExtractor } from './custom/www.theatlantic.com';
import { NewYorkerExtractor } from './custom/www.newyorker.com';
import { WiredExtractor } from './custom/www.wired.com';
import { MSNExtractor } from './custom/www.msn.com';
import { YahooExtractor } from './custom/www.yahoo.com';
import { BuzzfeedExtractor } from './custom/www.buzzfeed.com';
import { WikiaExtractor } from './custom/fandom.wikia.com';
const Extractors = {
@ -19,6 +22,9 @@ const Extractors = {
'www.newyorker.com': NewYorkerExtractor,
'www.wired.com': WiredExtractor,
'www.msn.com': MSNExtractor,
'www.yahoo.com': YahooExtractor,
'www.buzzfeed.com': BuzzfeedExtractor,
'fandom.wikia.com': WikiaExtractor,
};

@ -0,0 +1,61 @@
// Custom extraction rules for articles served from fandom.wikia.com.
// Every field carries the list of selectors used to locate that piece
// of the article. A [selector, attribute] pair presumably reads the named
// attribute from the matched node -- confirm against the extractor runtime.
export const WikiaExtractor = {
  domain: 'fandom.wikia.com',

  title: {
    selectors: [
      'h1.entry-title',
    ],
  },

  author: {
    selectors: [
      // `.author.vcard` targets one element carrying both classes. The
      // former `.author vcard` was a descendant selector looking for a
      // <vcard> element, which cannot match HTML markup.
      '.author.vcard',
      '.fn',
    ],
  },

  content: {
    selectors: [
      '.grid-content',
      '.entry-content',
    ],

    // No transforms are needed before the content is consumable.
    transforms: [],

    // Anything matching these selectors is removed from the result;
    // nothing is removed for this site.
    clean: [],
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
    ],
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
    ],
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value'],
    ],
  },

  // Not configured for this site.
  next_page_url: null,
  excerpt: null,
};

@ -0,0 +1,134 @@
import assert from 'assert';
import fs from 'fs';
import URL from 'url';
import cheerio from 'cheerio';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
// Tests for WikiaExtractor, run against a saved fandom.wikia.com article.
// The selectors under test live in
// ./src/extractors/custom/fandom.wikia.com/index.js and the extractor is
// registered in src/extractors/all.js.
describe('WikiaExtractor', () => {
  const url = 'http://fandom.wikia.com/articles/box-office-good-peculiar';
  const fixturePath = './fixtures/fandom.wikia.com/1475595373938.html';

  // Reads the fixture and parses it afresh on every call, so each test stays
  // independent. `fallback: false` is passed through unchanged; presumably
  // it limits parsing to this custom extractor -- confirm.
  const parseFixture = () =>
    Mercury.parse(url, fs.readFileSync(fixturePath), { fallback: false });

  it('is selected properly', () => {
    // The extractor chosen for the URL must declare the URL's hostname.
    const extractor = getExtractor(url);

    assert.equal(extractor.domain, URL.parse(url).hostname);
  });

  it('returns the title', async () => {
    const { title } = await parseFixture();

    assert.equal(title, 'Box Office: Its Good to Be Peculiar');
  });

  it('returns the author', async () => {
    const { author } = await parseFixture();

    assert.equal(author, 'Drew Dietsch');
  });

  it('returns the date_published', async () => {
    const { date_published } = await parseFixture();

    assert.equal(date_published, '2016-10-03T02:30:57.000Z');
  });

  it('returns the dek', async () => {
    const { dek } = await parseFixture();

    assert.equal(dek, 'Tim Burton once again claimed the top spot at the box office. Miss Peregrines Home for Peculiar Children secured a respectable #1 showing and may have some staying power in the coming weeks. All in all, its not a huge win but its good enough. Meanwhile, Deepwater Horizon performed about as well as expected. Seeing as how […]');
  });

  it('returns the lead_image_url', async () => {
    const { lead_image_url } = await parseFixture();

    assert.equal(lead_image_url, 'http://fandom.wikia.com/wp-content/uploads/2016/10/box-office-peculiar-feature-hero.jpg');
  });

  it('returns the content', async () => {
    const { content } = await parseFixture();

    // Fall back to an empty document so a missing `content` fails on the
    // assertion below rather than inside cheerio.
    const $ = cheerio.load(content || '');

    // Only the first 13 whitespace-separated words of the first element's
    // text are compared.
    const first13 = $('*').first()
      .text()
      .trim()
      .split(/\s+/)
      .slice(0, 13)
      .join(' ');

    assert.equal(first13, 'Tim Burton once again claimed the top spot at the box office. Miss');
  });
});

@ -0,0 +1,62 @@
// Custom extraction rules for articles served from www.buzzfeed.com.
// Every field carries the list of selectors used to locate that piece
// of the article. A [selector, attribute] pair presumably reads the named
// attribute from the matched node -- confirm against the extractor runtime.
export const BuzzfeedExtractor = {
  domain: 'www.buzzfeed.com',

  title: {
    selectors: [
      'h1[id="post-title"]',
    ],
  },

  author: {
    selectors: [
      'a[data-action="user/username"]',
      // Class selector; without the leading dot it would be read as an
      // element named <byline__author> and never match.
      '.byline__author',
    ],
  },

  content: {
    selectors: [
      '#buzz_sub_buzz',
      '.bf_dom',
      // The colon in the attribute name `rel:gt_cat` is backslash-escaped
      // because a bare colon is not valid inside a CSS attribute name.
      'div[rel\\:gt_cat="[ttp]:content"]',
    ],

    // No transforms are needed before the content is consumable.
    transforms: [],

    // Anything matching these selectors is removed from the result;
    // nothing is removed for this site.
    clean: [],
  },

  date_published: {
    selectors: [
      '.buzz-datetime',
    ],
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
    ],
  },

  dek: {
    selectors: [
      ['meta[name="description"]', 'value'],
    ],
  },

  // Not configured for this site.
  next_page_url: null,
  excerpt: null,
};

@ -0,0 +1,134 @@
import assert from 'assert';
import fs from 'fs';
import URL from 'url';
import cheerio from 'cheerio';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
// Tests for BuzzfeedExtractor, run against a saved www.buzzfeed.com article.
// The selectors under test live in
// ./src/extractors/custom/www.buzzfeed.com/index.js and the extractor is
// registered in src/extractors/all.js.
describe('BuzzfeedExtractor', () => {
  const url = 'https://www.buzzfeed.com/ikrd/people-are-calling-out-this-edited-picture-of-demi-lovato-fo';
  const fixturePath = './fixtures/www.buzzfeed.com/1475531975121.html';

  // Reads the fixture and parses it afresh on every call, so each test stays
  // independent. `fallback: false` is passed through unchanged; presumably
  // it limits parsing to this custom extractor -- confirm.
  const parseFixture = () =>
    Mercury.parse(url, fs.readFileSync(fixturePath), { fallback: false });

  it('is selected properly', () => {
    // The extractor chosen for the URL must declare the URL's hostname.
    const extractor = getExtractor(url);

    assert.equal(extractor.domain, URL.parse(url).hostname);
  });

  it('returns the title', async () => {
    const { title } = await parseFixture();

    assert.equal(title, 'People Are Calling Out This Edited Picture Of Demi Lovato For Body-Shaming Her');
  });

  it('returns the author', async () => {
    const { author } = await parseFixture();

    assert.equal(author, 'Ikran Dahir');
  });

  it('returns the date_published', async () => {
    const { date_published } = await parseFixture();

    // NOTE(review): the expected value is a single space, which looks like
    // an unfilled placeholder rather than a real timestamp -- confirm the
    // intended date for this fixture.
    assert.equal(date_published, ' ');
  });

  it('returns the dek', async () => {
    const { dek } = await parseFixture();

    assert.equal(dek, 'Lovato said: "Is that how my boobs should look?"..');
  });

  it('returns the lead_image_url', async () => {
    const { lead_image_url } = await parseFixture();

    assert.equal(lead_image_url, 'https://img.buzzfeed.com/buzzfeed-static/static/2016-10/3/12/social_promotion/buzzfeed-prod-fastlane01/facebook-social-promotion-17757-1475512210-1.jpg');
  });

  it('returns the content', async () => {
    const { content } = await parseFixture();

    // Fall back to an empty document so a missing `content` fails on the
    // assertion below rather than inside cheerio.
    const $ = cheerio.load(content || '');

    // Only the first 13 whitespace-separated words of the first element's
    // text are compared.
    const first13 = $('*').first()
      .text()
      .trim()
      .split(/\s+/)
      .slice(0, 13)
      .join(' ');

    assert.equal(first13, 'A few months ago, Vladimir Serbanescu, a 17-year-old artist from Romania, drew this');
  });
});

@ -0,0 +1,62 @@
// Custom extraction rules for articles served from www.yahoo.com.
// Every field carries the list of selectors used to locate that piece
// of the article. A [selector, attribute] pair presumably reads the named
// attribute from the matched node -- confirm against the extractor runtime.
export const YahooExtractor = {
  domain: 'www.yahoo.com',

  title: { selectors: ['header.canvas-header'] },

  author: { selectors: ['span.provider-name'] },

  content: {
    selectors: ['.content-canvas'],

    // No transforms are needed before the content is consumable.
    transforms: [],

    // Anything matching these selectors is removed from the result.
    clean: ['.figure-caption'],
  },

  date_published: { selectors: ['time.date'] },

  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']],
  },

  dek: {
    selectors: [['meta[name="og:description"]', 'value']],
  },

  // Not configured for this site.
  next_page_url: null,
  excerpt: null,
};

@ -0,0 +1,134 @@
import assert from 'assert';
import fs from 'fs';
import URL from 'url';
import cheerio from 'cheerio';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
// Tests for YahooExtractor, run against a saved www.yahoo.com article.
// The selectors under test live in
// ./src/extractors/custom/www.yahoo.com/index.js and the extractor is
// registered in src/extractors/all.js.
describe('YahooExtractor', () => {
  const url = 'https://www.yahoo.com/news/m/1c621104-b0eb-3b4d-9b0a-7bb979f80d7d/ss_clinton-cancels-joint-events.html';
  const fixturePath = './fixtures/www.yahoo.com/1475529982399.html';

  // Reads the fixture and parses it afresh on every call, so each test stays
  // independent. `fallback: false` is passed through unchanged; presumably
  // it limits parsing to this custom extractor -- confirm.
  const parseFixture = () =>
    Mercury.parse(url, fs.readFileSync(fixturePath), { fallback: false });

  it('is selected properly', () => {
    // The extractor chosen for the URL must declare the URL's hostname.
    const extractor = getExtractor(url);

    assert.equal(extractor.domain, URL.parse(url).hostname);
  });

  it('returns the title', async () => {
    const { title } = await parseFixture();

    assert.equal(title, 'Clinton Cancels Joint Events with Sanders');
  });

  it('returns the author', async () => {
    const { author } = await parseFixture();

    assert.equal(author, 'Fox Nation');
  });

  it('returns the date_published', async () => {
    const { date_published } = await parseFixture();

    assert.equal(date_published, '2016-10-03T04:00:00.000Z');
  });

  it('returns the dek', async () => {
    const { dek } = await parseFixture();

    assert.equal(dek, 'The Hillary Clinton campaign has canceled joint appearances with former primary opponent Bernie Sanders after he admitted that');
  });

  it('returns the lead_image_url', async () => {
    const { lead_image_url } = await parseFixture();

    assert.equal(lead_image_url, 'https://s.yimg.com/uu/api/res/1.2/tE8CoXSgHD15n5p8wUwGJA--/aD0zMDA7dz02MjQ7c209MTthcHBpZD15dGFjaHlvbg--/http://slingstone.zenfs.com/offnetwork/218c3f97f0b7e1598b6dc9fd10126e22');
  });

  it('returns the content', async () => {
    const { content } = await parseFixture();

    // Fall back to an empty document so a missing `content` fails on the
    // assertion below rather than inside cheerio.
    const $ = cheerio.load(content || '');

    // Only the first 13 whitespace-separated words of the first element's
    // text are compared.
    const first13 = $('*').first()
      .text()
      .trim()
      .split(/\s+/)
      .slice(0, 13)
      .join(' ');

    assert.equal(first13, 'The Hillary Clinton campaign has canceled joint appearances with former primary opponent Bernie');
  });
});
Loading…
Cancel
Save