feat: added wired custom extractor

8 years ago · 7ecc696248
parent 20b7c5a8b6
commit 7ecc696248
6 changed files with 263 additions and 19 deletions
--- a/dist/mercury.js
+++ b/dist/mercury.js
@ -26,8 +26,8 @@ var ellipsize = _interopDefault(require('ellipsize'));
 var _marked = [range].map(_regeneratorRuntime.mark);

 function range() {
-  var start = arguments.length <= 0 || arguments[0] === undefined ? 1 : arguments[0];
-  var end = arguments.length <= 1 || arguments[1] === undefined ? 1 : arguments[1];
+  var start = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : 1;
+  var end = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 1;
  return _regeneratorRuntime.wrap(function range$(_context) {
    while (1) {
      switch (_context.prev = _context.next) {
@ -101,7 +101,7 @@ function get(options) {
 // further processing of this url.

 function validateResponse(response) {
-  var parseNon2xx = arguments.length <= 1 || arguments[1] === undefined ? false : arguments[1];
+  var parseNon2xx = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : false;

  // Check if we got a valid status code
  if (response.statusMessage !== 'OK') {
@ -627,6 +627,49 @@ var NewYorkerExtractor = {
  excerpt: null
 };

+// Custom extractor for www.wired.com articles: one group of
+// CSS selectors per extracted field. It is registered in the
+// Extractors map below under the same key as `domain`.
+var WiredExtractor = {
+  domain: 'www.wired.com',
+  title: {
+    selectors: ['h1.post-title']
+  },
+
+  author: {
+    selectors: ['a[rel="author"]']
+  },
+
+  content: {
+    selectors: ['article.content'],
+
+    // Transforms are for content that needs rewriting before it is
+    // consumable (e.g. lazy-loaded images); none are defined here.
+    transforms: [],
+
+    // Anything matching a clean selector is removed from the
+    // extracted content; for this site that is the
+    // `.visually-hidden` elements.
+    clean: ['.visually-hidden']
+  },
+  // NOTE(review): each [selector, 'value'] pair presumably reads that attribute of the matched tag - confirm.
+  date_published: {
+    selectors: [['meta[itemprop="datePublished"]', 'value']]
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+
+  dek: {
+    selectors: [['meta[name="og:description"]', 'value']]
+  },
+
+  next_page_url: null,
+
+  excerpt: null
+};
+
 var Extractors = {
  'nymag.com': NYMagExtractor,
  'blogspot.com': BloggerExtractor,
@ -634,7 +677,9 @@ var Extractors = {
  'twitter.com': TwitterExtractor,
  'www.nytimes.com': NYTimesExtractor,
  'www.theatlantic.com': TheAtlanticExtractor,
-  'www.newyorker.com': NewYorkerExtractor
+  'www.newyorker.com': NewYorkerExtractor,
+  'www.wired.com': WiredExtractor
+
 };

 // Spacer images to be removed
@ -822,7 +867,7 @@ function brsToPs($) {
 // :param br: Whether or not the passed node is a br

 function paragraphize(node, $) {
-  var br = arguments.length <= 2 || arguments[2] === undefined ? false : arguments[2];
+  var br = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : false;

  var $node = $(node);

@ -892,7 +937,7 @@ function convertToParagraphs($) {
 }

 function convertNodeTo($node, $) {
-  var tag = arguments.length <= 2 || arguments[2] === undefined ? 'p' : arguments[2];
+  var tag = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 'p';

  var node = $node.get(0);
  if (!node) {
@ -952,7 +997,7 @@ function cleanImages($article, $) {
 }

 function stripJunkTags(article, $) {
-  var tags = arguments.length <= 2 || arguments[2] === undefined ? [] : arguments[2];
+  var tags = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : [];

  if (tags.length === 0) {
    tags = STRIP_OUTPUT_TAGS;
@ -1165,7 +1210,7 @@ function scoreCommas(text) {
 var idkRe = new RegExp('^(p|pre)$', 'i');

 function scoreLength(textLength) {
-  var tagName = arguments.length <= 1 || arguments[1] === undefined ? 'p' : arguments[1];
+  var tagName = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : 'p';

  var chunks = textLength / 50;

@ -1249,7 +1294,7 @@ function addToParent(node, $, score) {
 // if not, initializes a score based on
 // the node's tag type
 function getOrInitScore($node, $) {
-  var weightNodes = arguments.length <= 2 || arguments[2] === undefined ? true : arguments[2];
+  var weightNodes = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : true;

  var score = getScore($node);

@ -1339,7 +1384,7 @@ function scorePs($, weightNodes) {
 // score content. Parents get the full value of their children's
 // content score, grandparents half
 function scoreContent($) {
-  var weightNodes = arguments.length <= 1 || arguments[1] === undefined ? true : arguments[1];
+  var weightNodes = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : true;

  // First, look for special hNews based selectors and give them a big
  // boost, if they exist
@ -1709,7 +1754,7 @@ function cleanTags($article, $) {
 }

 function cleanHeaders($article, $) {
-  var title = arguments.length <= 2 || arguments[2] === undefined ? '' : arguments[2];
+  var title = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : '';

  $(HEADER_TAG_LIST, $article).each(function (index, header) {
    var $header = $(header);
@ -1794,7 +1839,7 @@ function linkDensity($node) {
 // search for, find a meta tag associated.

 function extractFromMeta($, metaNames, cachedNames) {
-  var cleanTags = arguments.length <= 3 || arguments[3] === undefined ? true : arguments[3];
+  var cleanTags = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;

  var foundNames = metaNames.filter(function (name) {
    return cachedNames.indexOf(name) !== -1;
@ -1885,8 +1930,8 @@ function isGoodNode($node, maxChildren) {
 // be extractable from the document. This is for flat
 // meta-information, like author, title, date published, etc.
 function extractFromSelectors($, selectors) {
-  var maxChildren = arguments.length <= 2 || arguments[2] === undefined ? 1 : arguments[2];
-  var textOnly = arguments.length <= 3 || arguments[3] === undefined ? true : arguments[3];
+  var maxChildren = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 1;
+  var textOnly = arguments.length > 3 && arguments[3] !== undefined ? arguments[3] : true;
  var _iteratorNormalCompletion = true;
  var _didIteratorError = false;
  var _iteratorError = undefined;
@ -2214,7 +2259,7 @@ function cleanDomainFromTitle(splitTitle, url) {
 // Given a title with separators in it (colons, dashes, etc),
 // resolve whether any of the segments should be removed.
 function resolveSplitTitle(title) {
-  var url = arguments.length <= 1 || arguments[1] === undefined ? '' : arguments[1];
+  var url = arguments.length > 1 && arguments[1] !== undefined ? arguments[1] : '';

  // Splits while preserving splitters, like:
  // ['The New New York', ' - ', 'The Washington Post']
@ -3236,7 +3281,7 @@ var GenericUrlExtractor = {
 var EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];

 function clean$2(content, $) {
-  var maxLength = arguments.length <= 2 || arguments[2] === undefined ? 200 : arguments[2];
+  var maxLength = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : 200;

  content = content.replace(/[\s\n]+/g, ' ').trim();
  return ellipsize(content, maxLength, { ellipse: '&hellip;' });
@ -3488,7 +3533,7 @@ function extractResult(opts) {

 var RootExtractor = {
  extract: function extract() {
-    var extractor = arguments.length <= 0 || arguments[0] === undefined ? GenericExtractor : arguments[0];
+    var extractor = arguments.length > 0 && arguments[0] !== undefined ? arguments[0] : GenericExtractor;
    var opts = arguments[1];
    var _opts = opts;
    var contentOnly = _opts.contentOnly;
@ -3628,7 +3673,7 @@ var Mercury = {
  parse: function parse(url, html) {
    var _this = this;

-    var opts = arguments.length <= 2 || arguments[2] === undefined ? {} : arguments[2];
+    var opts = arguments.length > 2 && arguments[2] !== undefined ? arguments[2] : {};
    return _asyncToGenerator(_regeneratorRuntime.mark(function _callee() {
      var _opts$fetchAllPages, fetchAllPages, _opts$fallback, fallback, parsedUrl, Extractor, $, metaCache, result, _result, title, next_page_url;

--- a/dist/mercury.js.map
+++ b/dist/mercury.js.map
--- a/fixtures/www.wired.com/1475256747028.html
+++ b/fixtures/www.wired.com/1475256747028.html
--- a/src/extractors/all.js
+++ b/src/extractors/all.js
@ -5,6 +5,7 @@ import { TwitterExtractor } from './custom/twitter.com';
 import { NYTimesExtractor } from './custom/www.nytimes.com';
 import { TheAtlanticExtractor } from './custom/www.theatlantic.com';
 import { NewYorkerExtractor } from './custom/www.newyorker.com';
+import { WiredExtractor } from './custom/www.wired.com';

 const Extractors = {
  'nymag.com': NYMagExtractor,
@ -14,6 +15,8 @@ const Extractors = {
  'www.nytimes.com': NYTimesExtractor,
  'www.theatlantic.com': TheAtlanticExtractor,
  'www.newyorker.com': NewYorkerExtractor,
+  'www.wired.com': WiredExtractor,
+
 };

 export default Extractors;
--- a/src/extractors/custom/www.wired.com/index.js
+++ b/src/extractors/custom/www.wired.com/index.js
@ -0,0 +1,61 @@
+// Custom extractor for www.wired.com articles: one group of
+// CSS selectors per extracted field. It is imported and
+// registered under 'www.wired.com' in src/extractors/all.js.
+export const WiredExtractor = {
+  domain: 'www.wired.com',
+  title: {
+    selectors: [
+      'h1.post-title',
+      // further title selectors can be listed here
+    ],
+  },
+
+  author: {
+    selectors: [
+      'a[rel="author"]',
+      // further author selectors can be listed here
+    ],
+  },
+
+  content: {
+    selectors: [
+      'article.content',
+      // further content selectors can be listed here
+    ],
+
+    // Transforms are for content that needs rewriting before it is
+    // consumable (e.g. lazy-loaded images); none are defined here.
+    transforms: [
+    ],
+
+    // Anything matching a clean selector is removed from the
+    // extracted content; for this site that is the
+    // `.visually-hidden` elements.
+    clean: [
+      '.visually-hidden',
+
+    ],
+  },
+  // NOTE(review): each [selector, 'value'] pair presumably reads that attribute of the matched tag - confirm.
+  date_published: {
+    selectors: [
+      ['meta[itemprop="datePublished"]', 'value'],
+    ],
+  },
+
+  lead_image_url: {
+    selectors: [
+      ['meta[name="og:image"]', 'value'],
+    ],
+  },
+
+  dek: {
+    selectors: [
+      ['meta[name="og:description"]', 'value'],
+    ],
+  },
+
+  next_page_url: null,
+
+  excerpt: null,
+};
--- a/src/extractors/custom/www.wired.com/index.test.js
+++ b/src/extractors/custom/www.wired.com/index.test.js
@ -0,0 +1,134 @@
+import assert from 'assert';
+import fs from 'fs';
+import URL from 'url';
+import cheerio from 'cheerio';
+
+import Mercury from 'mercury';
+import getExtractor from 'extractors/get-extractor';
+
+// Fixture-based tests for the www.wired.com custom extractor.
+describe('WiredExtractor', () => {
+  it('is selected properly', () => {
+    // getExtractor should resolve this URL to the extractor
+    // exported from ./src/extractors/custom/www.wired.com/index.js
+    // and registered in src/extractors/all.js, so the
+    // extractor's `domain` must equal the hostname parsed
+    // from the URL.
+    const url =
+      'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';
+    const extractor = getExtractor(url);
+    assert.equal(extractor.domain, URL.parse(url).hostname);
+  });
+
+  it('returns the title', async () => {
+    // Exercises the title selector
+    // in ./src/extractors/custom/www.wired.com/index.js.
+    const html =
+      fs.readFileSync('./fixtures/www.wired.com/1475256747028.html');
+    const articleUrl =
+      'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';
+
+    const { title } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // Expected title for the saved
+    // fixture article.
+    assert.equal(title, 'An Ode to the Rosetta Spacecraft as It Flings Itself Into a Comet');
+  });
+
+
+  it('returns the author', async () => {
+    // Exercises the author selector
+    // in ./src/extractors/custom/www.wired.com/index.js.
+    const html =
+      fs.readFileSync('./fixtures/www.wired.com/1475256747028.html');
+    const articleUrl =
+      'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';
+
+    const { author } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // Expected author for the saved
+    // fixture article.
+    assert.equal(author, 'Emma Grey Ellis');
+  });
+
+
+  it('returns the date_published', async () => {
+    // Exercises the date_published selector
+    // in ./src/extractors/custom/www.wired.com/index.js.
+    const html =
+      fs.readFileSync('./fixtures/www.wired.com/1475256747028.html');
+    const articleUrl =
+      'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';
+
+    const { date_published } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // Expected publication date for the saved
+    // fixture article.
+    assert.equal(date_published, '2016-09-30T07:00:12.000Z');
+  });
+
+
+  it('returns the dek', async () => {
+    // Exercises the dek selector
+    // in ./src/extractors/custom/www.wired.com/index.js.
+    const html =
+      fs.readFileSync('./fixtures/www.wired.com/1475256747028.html');
+    const articleUrl =
+      'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';
+
+    const { dek } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // Expected dek for the saved
+    // fixture article.
+    assert.equal(dek, 'Time to break out the tissues, space fans.');
+  });
+
+
+  it('returns the lead_image_url', async () => {
+    // Exercises the lead_image_url selector
+    // in ./src/extractors/custom/www.wired.com/index.js.
+    const html =
+      fs.readFileSync('./fixtures/www.wired.com/1475256747028.html');
+    const articleUrl =
+      'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';
+
+    const { lead_image_url } =
+      await Mercury.parse(articleUrl, html, { fallback: false });
+
+    // Expected lead image URL for the saved
+    // fixture article.
+    assert.equal(lead_image_url, 'https://www.wired.com/wp-content/uploads/2016/09/Rosetta_impact-1-1200x630.jpg');
+  });
+
+
+  it('returns the content', async () => {
+    // Exercises the content selector (together with its
+    // clean and transforms options)
+    // in ./src/extractors/custom/www.wired.com/index.js by
+    // comparing the first 13 words of the extracted content.
+    const html =
+      fs.readFileSync('./fixtures/www.wired.com/1475256747028.html');
+    const url =
+      'https://www.wired.com/2016/09/ode-rosetta-spacecraft-going-die-comet/';
+
+    const { content } =
+      await Mercury.parse(url, html, { fallback: false });
+
+    const $ = cheerio.load(content || '');
+
+    const first13 = $('*').first()
+                          .text()
+                          .trim()
+                          .split(/\s+/)
+                          .slice(0, 13)
+                          .join(' ');
+
+    // Expected opening words of the saved
+    // fixture article's body text.
+    assert.equal(first13, 'Today, the European Space Agency’s Rosetta spacecraft will engage its thrusters for one');
+  });
+});