chore: refactor format output adjustments (#272)

I had previously done this in an overly complicated manner. This PR cleans it up a bit.
5 years ago · 9bf88b0ba3
parent 867623ab33
commit 9bf88b0ba3
6 changed files with 64 additions and 139 deletions
--- a/dist/mercury.js
+++ b/dist/mercury.js
@@ -8,6 +8,7 @@ var _objectWithoutProperties = _interopDefault(require('@babel/runtime-corejs2/h
 var _asyncToGenerator = _interopDefault(require('@babel/runtime-corejs2/helpers/asyncToGenerator'));
 var URL = _interopDefault(require('url'));
 var cheerio = _interopDefault(require('cheerio'));
+var TurndownService = _interopDefault(require('turndown'));
 var iconv = _interopDefault(require('iconv-lite'));
 var _parseInt = _interopDefault(require('@babel/runtime-corejs2/core-js/parse-int'));
 var _slicedToArray = _interopDefault(require('@babel/runtime-corejs2/helpers/slicedToArray'));
@@ -21,7 +22,6 @@ var _Set = _interopDefault(require('@babel/runtime-corejs2/core-js/set'));
 var _typeof = _interopDefault(require('@babel/runtime-corejs2/helpers/typeof'));
 var _getIterator = _interopDefault(require('@babel/runtime-corejs2/core-js/get-iterator'));
 var _Object$keys = _interopDefault(require('@babel/runtime-corejs2/core-js/object/keys'));
-var TurndownService = _interopDefault(require('turndown'));
 var stringDirection = _interopDefault(require('string-direction'));
 var validUrl = _interopDefault(require('valid-url'));
 var moment = _interopDefault(require('moment-timezone'));
@@ -6018,9 +6018,7 @@ var GenericExtractor = {
  },
  extract: function extract(options) {
    var html = options.html,
-        $ = options.$,
-        _options$contentType = options.contentType,
-        contentType = _options$contentType === void 0 ? 'html' : _options$contentType;
+        $ = options.$;

    if (html && !$) {
      var loaded = cheerio.load(html);
@@ -6054,24 +6052,13 @@ var GenericExtractor = {
        url = _this$url_and_domain.url,
        domain = _this$url_and_domain.domain;

-    var convertedContent;
-
-    if (contentType === 'html') {
-      convertedContent = content;
-    } else if (contentType === 'text') {
-      convertedContent = $.text(cheerio.load(content));
-    } else if (contentType === 'markdown') {
-      var turndownService = new TurndownService();
-      convertedContent = turndownService.turndown(content);
-    }
-
    return {
      title: title,
      author: author,
      date_published: date_published || null,
      dek: dek,
      lead_image_url: lead_image_url,
-      content: convertedContent,
+      content: content,
      next_page_url: next_page_url,
      url: url,
      domain: domain,
@@ -6161,9 +6148,7 @@ function select(opts) {
      type = opts.type,
      extractionOpts = opts.extractionOpts,
      _opts$extractHtml = opts.extractHtml,
-      extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml,
-      _opts$contentType = opts.contentType,
-      contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType; // Skip if there's not extraction for this type
+      extractHtml = _opts$extractHtml === void 0 ? false : _opts$extractHtml; // Skip if there's not extraction for this type

  if (!extractionOpts) return null; // If a string is hardcoded for a type (e.g., Wikipedia
  // contributors), return the string
@@ -6205,19 +6190,7 @@ function select(opts) {
    $content = Cleaners[type]($content, _objectSpread({}, opts, {
      defaultCleaner: defaultCleaner
    }));
-
-    if (contentType === 'html') {
-      return $.html($content);
-    }
-
-    if (contentType === 'text') {
-      return $.text($content);
-    }
-
-    if (contentType === 'markdown') {
-      var turndownService = new TurndownService();
-      return turndownService.turndown($.html($content));
-    }
+    return $.html($content);
  }

  var result; // if selector is an array (e.g., ['img', 'src']),
@@ -6270,9 +6243,7 @@ var RootExtractor = {
    var opts = arguments.length > 1 ? arguments[1] : undefined;
    var _opts = opts,
        contentOnly = _opts.contentOnly,
-        extractedTitle = _opts.extractedTitle,
-        _opts$contentType2 = _opts.contentType,
-        contentType = _opts$contentType2 === void 0 ? 'html' : _opts$contentType2; // This is the generic extractor. Run its extract method
+        extractedTitle = _opts.extractedTitle; // This is the generic extractor. Run its extract method

    if (extractor.domain === '*') return extractor.extract(opts);
    opts = _objectSpread({}, opts, {
@@ -6283,8 +6254,7 @@ var RootExtractor = {
      var _content = extractResult(_objectSpread({}, opts, {
        type: 'content',
        extractHtml: true,
-        title: extractedTitle,
-        contentType: contentType
+        title: extractedTitle
      }));

      return {
@@ -6451,6 +6421,7 @@ var Mercury = {
          _result,
          title,
          next_page_url,
+          turndownService,
          _args = arguments;

      return _regeneratorRuntime.wrap(function _callee$(_context) {
@@ -6545,9 +6516,16 @@ var Mercury = {
              });

            case 23:
+              if (contentType === 'markdown') {
+                turndownService = new TurndownService();
+                result.content = turndownService.turndown(result.content);
+              } else if (contentType === 'text') {
+                result.content = $.text($(result.content));
+              }
+
              return _context.abrupt("return", result);

-            case 24:
+            case 25:
            case "end":
              return _context.stop();
          }
--- a/src/extractors/generic/index.js
+++ b/src/extractors/generic/index.js
@@ -1,5 +1,4 @@
 import cheerio from 'cheerio';
-import TurndownService from 'turndown';
 import stringDirection from 'string-direction';

 import GenericContentExtractor from './content/extractor';
@@ -29,7 +28,7 @@ const GenericExtractor = {
  direction: ({ title }) => stringDirection.getDirection(title),

  extract(options) {
-    const { html, $, contentType = 'html' } = options;
+    const { html, $ } = options;

    if (html && !$) {
      const loaded = cheerio.load(html);
@@ -48,24 +47,13 @@ const GenericExtractor = {
    const direction = this.direction({ title });
    const { url, domain } = this.url_and_domain(options);

-    let convertedContent;
-
-    if (contentType === 'html') {
-      convertedContent = content;
-    } else if (contentType === 'text') {
-      convertedContent = $.text(cheerio.load(content));
-    } else if (contentType === 'markdown') {
-      const turndownService = new TurndownService();
-      convertedContent = turndownService.turndown(content);
-    }
-
    return {
      title,
      author,
      date_published: date_published || null,
      dek,
      lead_image_url,
-      content: convertedContent,
+      content,
      next_page_url,
      url,
      domain,
--- a/src/extractors/root-extractor.js
+++ b/src/extractors/root-extractor.js
@@ -1,4 +1,3 @@
-import TurndownService from 'turndown';
 import Cleaners from 'cleaners';
 import { convertNodeTo } from 'utils/dom';
 import GenericExtractor from './generic';
@@ -67,13 +66,7 @@ function findMatchingSelector($, selectors, extractHtml) {
 }

 export function select(opts) {
-  const {
-    $,
-    type,
-    extractionOpts,
-    extractHtml = false,
-    contentType = 'html',
-  } = opts;
+  const { $, type, extractionOpts, extractHtml = false } = opts;
  // Skip if there's not extraction for this type
  if (!extractionOpts) return null;

@@ -120,16 +113,7 @@ export function select(opts) {

    $content = Cleaners[type]($content, { ...opts, defaultCleaner });

-    if (contentType === 'html') {
-      return $.html($content);
-    }
-    if (contentType === 'text') {
-      return $.text($content);
-    }
-    if (contentType === 'markdown') {
-      const turndownService = new TurndownService();
-      return turndownService.turndown($.html($content));
-    }
+    return $.html($content);
  }

  let result;
@@ -178,7 +162,7 @@ function extractResult(opts) {

 const RootExtractor = {
  extract(extractor = GenericExtractor, opts) {
-    const { contentOnly, extractedTitle, contentType = 'html' } = opts;
+    const { contentOnly, extractedTitle } = opts;
    // This is the generic extractor. Run its extract method
    if (extractor.domain === '*') return extractor.extract(opts);

@@ -193,7 +177,6 @@ const RootExtractor = {
        type: 'content',
        extractHtml: true,
        title: extractedTitle,
-        contentType,
      });
      return {
        content,
--- a/src/extractors/root-extractor.test.js
+++ b/src/extractors/root-extractor.test.js
@@ -32,73 +32,6 @@ describe('RootExtractor', () => {

    assert.equal(url, null);
  });
-  it('returns text content if text is passed as contentType', () => {
-    const fullUrl =
-      'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
-    const html = fs.readFileSync(
-      './src/extractors/custom/nymag.com/fixtures/test.html',
-      'utf8'
-    );
-    const $ = cheerio.load(html);
-
-    const { content } = RootExtractor.extract(NYMagExtractor, {
-      url: fullUrl,
-      html,
-      $,
-      metaCache: [],
-      fallback: false,
-      contentType: 'text',
-    });
-
-    const htmlRe = /<[a-z][\s\S]*>/g;
-
-    assert.equal(htmlRe.test(content), false);
-  });
-  it('returns markdown if markdown is passed as contentType', () => {
-    const fullUrl =
-      'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
-    const html = fs.readFileSync(
-      './src/extractors/custom/nymag.com/fixtures/test.html',
-      'utf8'
-    );
-    const $ = cheerio.load(html);
-
-    const { content } = RootExtractor.extract(NYMagExtractor, {
-      url: fullUrl,
-      html,
-      $,
-      metaCache: [],
-      fallback: false,
-      contentType: 'markdown',
-    });
-
-    const htmlRe = /<[a-z][\s\S]*>/;
-    const markdownRe = /\[[\w\s]+\]\(.*\)/;
-
-    assert.equal(htmlRe.test(content), false);
-    assert.equal(markdownRe.test(content), true);
-  });
-  it('also can select type on Generic Extractor', () => {
-    const fullUrl =
-      'http://www.vulture.com/2016/08/dc-comics-greg-berlanti-c-v-r.html';
-
-    const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
-    const $ = cheerio.load(html);
-    const { content } = RootExtractor.extract(undefined, {
-      url: fullUrl,
-      html,
-      $,
-      metaCache: [],
-      fallback: false,
-      contentType: 'markdown',
-    });
-
-    const htmlRe = /<[a-z][\s\S]*>/;
-    const markdownRe = /\[[\w\s]+\]\(.*\)/;
-
-    assert.equal(htmlRe.test(content), false);
-    assert.equal(markdownRe.test(content), true);
-  });
 });

 describe('cleanBySelectors($content, $, { clean })', () => {
--- a/src/mercury.js
+++ b/src/mercury.js
@@ -1,5 +1,6 @@
 import URL from 'url';
 import cheerio from 'cheerio';
+import TurndownService from 'turndown';

 import Resource from 'resource';
 import { validateUrl, Errors } from 'utils';
@@ -83,6 +84,13 @@ const Mercury = {
      };
    }

+    if (contentType === 'markdown') {
+      const turndownService = new TurndownService();
+      result.content = turndownService.turndown(result.content);
+    } else if (contentType === 'text') {
+      result.content = $.text($(result.content));
+    }
+
    return result;
  },

--- a/src/mercury.test.js
+++ b/src/mercury.test.js
@@ -4,6 +4,8 @@ import { Errors } from 'utils';
 import { record } from 'test-helpers';
 import Mercury from './mercury';

+const fs = require('fs');
+
 describe('Mercury', () => {
  const recorder = record('mercury-test');
  beforeAll(recorder.before);
@@ -92,4 +94,37 @@ describe('Mercury', () => {
      assert.equal(result.next_page_url, `${url}2`);
    });
  });
+
+  it('returns text content if text is passed as contentType', async () => {
+    const url =
+      'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
+    const html = fs.readFileSync(
+      './src/extractors/custom/nymag.com/fixtures/test.html',
+      'utf8'
+    );
+    const { content } = await Mercury.parse(url, { html, contentType: 'text' });
+
+    const htmlRe = /<[a-z][\s\S]*>/g;
+
+    assert.equal(htmlRe.test(content), false);
+  });
+
+  it('returns markdown if markdown is passed as contentType', async () => {
+    const url =
+      'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
+    const html = fs.readFileSync(
+      './src/extractors/custom/nymag.com/fixtures/test.html',
+      'utf8'
+    );
+    const { content } = await Mercury.parse(url, {
+      html,
+      contentType: 'markdown',
+    });
+
+    const htmlRe = /<[a-z][\s\S]*>/;
+    const markdownRe = /\[[\w\s]+\]\(.*\)/;
+
+    assert.equal(htmlRe.test(content), false);
+    assert.equal(markdownRe.test(content), true);
+  });
 });