Merge remote-tracking branch 'origin/master'
commit
84619641b0
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,61 @@
|
||||
// Rename CustomExtractor
|
||||
// to fit your publication
|
||||
// (e.g., NYTimesExtractor)
|
||||
/**
 * Extractor configuration for articles hosted on fandom.wikia.com.
 *
 * Every field lists candidate CSS selectors. A two-element array pairs a
 * selector with an attribute name (presumably the attribute whose value is
 * read instead of the element text -- confirm against the extractor runtime).
 */
export const WikiaExtractor = {
  domain: 'fandom.wikia.com',

  title: {
    selectors: [
      'h1.entry-title',
    ],
  },

  author: {
    selectors: [
      // Element carrying both the `author` and `vcard` classes. The previous
      // '.author vcard' looked for a <vcard> tag nested inside `.author`,
      // which is not an HTML element, so that selector could never match.
      '.author.vcard',
      '.fn',
    ],
  },

  content: {
    selectors: [
      '.grid-content',
      '.entry-content',
    ],

    // Transformations to apply to the selected content before it is
    // consumable (e.g. unusual lazy-loaded images). None defined.
    transforms: [],

    // Selectors for anything in the result that should be removed.
    // None defined.
    clean: [],
  },

  date_published: {
    selectors: [
      ['meta[name="article:published_time"]', 'value'],
    ],
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
    ],
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value'],
    ],
  },

  next_page_url: null,

  excerpt: null,
};
|
@ -0,0 +1,62 @@
|
||||
// Rename CustomExtractor
|
||||
// to fit your publication
|
||||
// (e.g., NYTimesExtractor)
|
||||
/**
 * Extractor configuration for articles hosted on www.buzzfeed.com.
 *
 * Every field lists candidate CSS selectors. A two-element array pairs a
 * selector with an attribute name (presumably the attribute whose value is
 * read instead of the element text -- confirm against the extractor runtime).
 */
export const BuzzfeedExtractor = {
  domain: 'www.buzzfeed.com',

  title: {
    selectors: [
      'h1[id="post-title"]',
    ],
  },

  author: {
    selectors: [
      'a[data-action="user/username"]',
      // Class selector: without the leading dot this was a tag selector for
      // a <byline__author> element and could never match.
      '.byline__author',
    ],
  },

  content: {
    selectors: [
      '#buzz_sub_buzz',
      '.bf_dom',
      // The attribute is literally named `rel:gt_cat`; the colon has to be
      // escaped because a bare `:` is not valid inside a CSS identifier.
      'div[rel\\:gt_cat="[ttp]:content"]',
    ],

    // Transformations to apply to the selected content before it is
    // consumable (e.g. unusual lazy-loaded images). None defined.
    transforms: [],

    // Selectors for anything in the result that should be removed.
    // None defined.
    clean: [],
  },

  date_published: {
    selectors: [
      '.buzz-datetime',
    ],
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
    ],
  },

  dek: {
    selectors: [
      ['meta[name="description"]', 'value'],
    ],
  },

  next_page_url: null,

  excerpt: null,
};
|
@ -0,0 +1,134 @@
|
||||
import assert from 'assert';
|
||||
import fs from 'fs';
|
||||
import URL from 'url';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import Mercury from 'mercury';
|
||||
import getExtractor from 'extractors/get-extractor';
|
||||
|
||||
// Rename CustomExtractor
|
||||
// Spec for the www.buzzfeed.com custom extractor.
//
// The selectors under test live in
// ./src/extractors/custom/www.buzzfeed.com/index.js, and the extractor has to
// be registered in src/extractors/all.js for the lookup test to pass.
describe('BuzzfeedExtractor', () => {
  const fixturePath = './fixtures/www.buzzfeed.com/1475531975121.html';
  const articleUrl =
    'https://www.buzzfeed.com/ikrd/people-are-calling-out-this-edited-picture-of-demi-lovato-fo';

  // Reads the saved fixture and parses it with `fallback: false`, exactly as
  // each test previously did inline. The file is read on every call so a
  // missing fixture fails the individual test rather than the whole suite.
  const parseFixture = () =>
    Mercury.parse(articleUrl, fs.readFileSync(fixturePath), { fallback: false });

  it('is selected properly', () => {
    const extractor = getExtractor(articleUrl);

    assert.equal(extractor.domain, URL.parse(articleUrl).hostname);
  });

  it('returns the title', async () => {
    const { title } = await parseFixture();

    assert.equal(title, 'People Are Calling Out This Edited Picture Of Demi Lovato For Body-Shaming Her');
  });

  it('returns the author', async () => {
    const { author } = await parseFixture();

    assert.equal(author, 'Ikran Dahir');
  });

  it('returns the date_published', async () => {
    const { date_published } = await parseFixture();

    // NOTE(review): ' ' looks like an unfilled placeholder rather than a real
    // expected date -- replace it with the value from the fixture article.
    assert.equal(date_published, ' ');
  });

  it('returns the dek', async () => {
    const { dek } = await parseFixture();

    assert.equal(dek, 'Lovato said: "Is that how my boobs should look?"..');
  });

  it('returns the lead_image_url', async () => {
    const { lead_image_url } = await parseFixture();

    assert.equal(lead_image_url, 'https://img.buzzfeed.com/buzzfeed-static/static/2016-10/3/12/social_promotion/buzzfeed-prod-fastlane01/facebook-social-promotion-17757-1475512210-1.jpg');
  });

  it('returns the content', async () => {
    const { content } = await parseFixture();

    // Fall back to an empty document so a missing `content` surfaces as an
    // assertion failure below instead of an exception inside cheerio.
    const $ = cheerio.load(content || '');

    // Compare only the first 13 whitespace-separated words of the first
    // element's text.
    const first13 = $('*').first()
      .text()
      .trim()
      .split(/\s+/)
      .slice(0, 13)
      .join(' ');

    assert.equal(first13, 'A few months ago, Vladimir Serbanescu, a 17-year-old artist from Romania, drew this');
  });
});
|
@ -0,0 +1,62 @@
|
||||
// Rename CustomExtractor
|
||||
// to fit your publication
|
||||
// (e.g., NYTimesExtractor)
|
||||
/**
 * Extractor configuration for articles hosted on www.yahoo.com.
 *
 * Every field lists candidate CSS selectors. A two-element array pairs a
 * selector with an attribute name (presumably the attribute whose value is
 * read instead of the element text -- confirm against the extractor runtime).
 */
export const YahooExtractor = {
  domain: 'www.yahoo.com',

  title: {
    selectors: [
      'header.canvas-header',
    ],
  },

  author: {
    selectors: [
      'span.provider-name',
    ],
  },

  content: {
    selectors: [
      '.content-canvas',
    ],

    // Transformations to apply to the selected content before it is
    // consumable (e.g. unusual lazy-loaded images). None defined.
    transforms: [],

    // Selectors for anything in the result that should be removed.
    clean: [
      '.figure-caption',
    ],
  },

  date_published: {
    selectors: [
      'time.date',
    ],
  },

  lead_image_url: {
    selectors: [
      ['meta[name="og:image"]', 'value'],
    ],
  },

  dek: {
    selectors: [
      ['meta[name="og:description"]', 'value'],
    ],
  },

  next_page_url: null,

  excerpt: null,
};
|
@ -0,0 +1,134 @@
|
||||
import assert from 'assert';
|
||||
import fs from 'fs';
|
||||
import URL from 'url';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import Mercury from 'mercury';
|
||||
import getExtractor from 'extractors/get-extractor';
|
||||
|
||||
// Rename CustomExtractor
|
||||
// Spec for the www.yahoo.com custom extractor.
//
// The selectors under test live in
// ./src/extractors/custom/www.yahoo.com/index.js, and the extractor has to be
// registered in src/extractors/all.js for the lookup test to pass.
describe('YahooExtractor', () => {
  const fixturePath = './fixtures/www.yahoo.com/1475529982399.html';
  const articleUrl =
    'https://www.yahoo.com/news/m/1c621104-b0eb-3b4d-9b0a-7bb979f80d7d/ss_clinton-cancels-joint-events.html';

  // Reads the saved fixture and parses it with `fallback: false`, exactly as
  // each test previously did inline. The file is read on every call so a
  // missing fixture fails the individual test rather than the whole suite.
  const parseFixture = () =>
    Mercury.parse(articleUrl, fs.readFileSync(fixturePath), { fallback: false });

  it('is selected properly', () => {
    const extractor = getExtractor(articleUrl);

    assert.equal(extractor.domain, URL.parse(articleUrl).hostname);
  });

  it('returns the title', async () => {
    const { title } = await parseFixture();

    assert.equal(title, 'Clinton Cancels Joint Events with Sanders');
  });

  it('returns the author', async () => {
    const { author } = await parseFixture();

    assert.equal(author, 'Fox Nation');
  });

  it('returns the date_published', async () => {
    const { date_published } = await parseFixture();

    assert.equal(date_published, '2016-10-03T04:00:00.000Z');
  });

  it('returns the dek', async () => {
    const { dek } = await parseFixture();

    assert.equal(dek, 'The Hillary Clinton campaign has canceled joint appearances with former primary opponent Bernie Sanders after he admitted that');
  });

  it('returns the lead_image_url', async () => {
    const { lead_image_url } = await parseFixture();

    assert.equal(lead_image_url, 'https://s.yimg.com/uu/api/res/1.2/tE8CoXSgHD15n5p8wUwGJA--/aD0zMDA7dz02MjQ7c209MTthcHBpZD15dGFjaHlvbg--/http://slingstone.zenfs.com/offnetwork/218c3f97f0b7e1598b6dc9fd10126e22');
  });

  it('returns the content', async () => {
    const { content } = await parseFixture();

    // Fall back to an empty document so a missing `content` surfaces as an
    // assertion failure below instead of an exception inside cheerio.
    const $ = cheerio.load(content || '');

    // Compare only the first 13 whitespace-separated words of the first
    // element's text.
    const first13 = $('*').first()
      .text()
      .trim()
      .split(/\s+/)
      .slice(0, 13)
      .join(' ');

    assert.equal(first13, 'The Hillary Clinton campaign has canceled joint appearances with former primary opponent Bernie');
  });
});
|
Loading…
Reference in New Issue