feat: remove obsolete custom extractors (#712)

2 years ago · 7b68bcd94c
parent 4981355628
commit 7b68bcd94c
12 changed files with 67 additions and 581 deletions
--- a/dist/mercury.js
+++ b/dist/mercury.js
@@ -2227,41 +2227,29 @@ var LittleThingsExtractor = {
  excerpt: null
 };

-// Rename CustomExtractor
-// to fit your publication
-// (e.g., NYTimesExtractor)
 var PoliticoExtractor = {
  domain: 'www.politico.com',
  title: {
-    selectors: [// enter title selectors
-    ['meta[name="og:title"]', 'value']]
+    selectors: [['meta[name="og:title"]', 'value']]
  },
  author: {
-    selectors: ['.story-main-content .byline .vcard']
+    selectors: [['div[itemprop="author"] meta[itemprop="name"]', 'value'], '.story-meta__authors .vcard', '.story-main-content .byline .vcard']
  },
  content: {
-    selectors: [// enter content selectors
-    '.story-main-content', '.content-group', '.story-core', '.story-text'],
-    // Is there anything in the content you selected that needs transformed
-    // before it's consumable content? E.g., unusual lazy loaded images
+    selectors: [['.story-text'], '.story-main-content', '.story-core'],
    transforms: [],
-    // Is there anything that is in the result that shouldn't be?
-    // The clean selectors will remove anything that matches from
-    // the result
-    clean: ['figcaption']
+    clean: ['figcaption', '.story-meta', '.ad']
  },
  date_published: {
-    selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']]
+    selectors: [['time[itemprop="datePublished"]', 'datetime'], ['.story-meta__details time[datetime]', 'datetime'], ['.story-main-content .timestamp time[datetime]', 'datetime']],
+    timezone: 'America/New_York'
  },
  lead_image_url: {
-    selectors: [// enter lead_image_url selectors
-    ['meta[name="og:image"]', 'value']]
+    selectors: [['meta[name="og:image"]', 'value']]
  },
  dek: {
-    selectors: []
-  },
-  next_page_url: null,
-  excerpt: null
+    selectors: [['meta[name="og:description"]', 'value']]
+  }
 };

 var DeadspinExtractor = {
@@ -3980,33 +3968,6 @@ var WwwCnetComExtractor = {
  }
 };

-var WwwCinemablendComExtractor = {
-  domain: 'www.cinemablend.com',
-  title: {
-    selectors: ['.story_title']
-  },
-  author: {
-    selectors: ['.author']
-  },
-  date_published: {
-    selectors: [['meta[name="article:published_time"]', 'value']],
-    timezone: 'EST'
-  },
-  lead_image_url: {
-    selectors: [['meta[name="og:image"]', 'value']]
-  },
-  content: {
-    selectors: ['div#wrap_left_content'],
-    // Is there anything in the content you selected that needs transformed
-    // before it's consumable content? E.g., unusual lazy loaded images
-    transforms: {},
-    // Is there anything that is in the result that shouldn't be?
-    // The clean selectors will remove anything that matches from
-    // the result
-    clean: []
-  }
-};
-
 var WwwTodayComExtractor = {
  domain: 'www.today.com',
  title: {
@@ -4033,33 +3994,6 @@ var WwwTodayComExtractor = {
  }
 };

-var WwwHowtogeekComExtractor = {
-  domain: 'www.howtogeek.com',
-  title: {
-    selectors: ['title']
-  },
-  author: {
-    selectors: ['#authorinfobox a']
-  },
-  date_published: {
-    selectors: ['#authorinfobox + div li'],
-    timezone: 'GMT'
-  },
-  lead_image_url: {
-    selectors: [['meta[name="og:image"]', 'value']]
-  },
-  content: {
-    selectors: ['.thecontent'],
-    // Is there anything in the content you selected that needs transformed
-    // before it's consumable content? E.g., unusual lazy loaded images
-    transforms: {},
-    // Is there anything that is in the result that shouldn't be?
-    // The clean selectors will remove anything that matches from
-    // the result
-    clean: []
-  }
-};
-
 var WwwAlComExtractor = {
  domain: 'www.al.com',
  title: {
@@ -4286,33 +4220,6 @@ var ThoughtcatalogComExtractor = {
  }
 };

-var WwwNjComExtractor = {
-  domain: 'www.nj.com',
-  title: {
-    selectors: [['meta[name="title"]', 'value']]
-  },
-  author: {
-    selectors: [['meta[name="article_author"]', 'value']]
-  },
-  date_published: {
-    selectors: [['meta[name="article_date_original"]', 'value']],
-    timezone: 'America/New_York'
-  },
-  lead_image_url: {
-    selectors: [['meta[name="og:image"]', 'value']]
-  },
-  content: {
-    selectors: ['.entry-content'],
-    // Is there anything in the content you selected that needs transformed
-    // before it's consumable content? E.g., unusual lazy loaded images
-    transforms: {},
-    // Is there anything that is in the result that shouldn't be?
-    // The clean selectors will remove anything that matches from
-    // the result
-    clean: []
-  }
-};
-
 var WwwInquisitrComExtractor = {
  domain: 'www.inquisitr.com',
  title: {
@@ -6185,14 +6092,66 @@ var PostlightComExtractor = {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
-    selectors: ['article.body'],
+    selectors: ['main.post'],
+    // Is there anything in the content you selected that needs transformed
+    // before it's consumable content? E.g., unusual lazy loaded images
+    transforms: {},
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: ['section.pl-post-link', 'aside', 'section.insights_featured_case_studies']
+  }
+};
+
+var WwwInvestmentexecutiveComExtractor = {
+  domain: 'www.investmentexecutive.com',
+  title: {
+    selectors: ['h1']
+  },
+  author: {
+    selectors: ['div[itemprop="author"]']
+  },
+  date_published: {
+    selectors: [['meta[itemprop="datePublished"]', 'value']]
+  },
+  dek: {
+    selectors: [['meta[name="og:description"]', 'value']]
+  },
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+  content: {
+    selectors: ['section.article-body'],
+    clean: ['.hidden']
+  }
+};
+
+var WwwCbcCaExtractor = {
+  domain: 'www.cbc.ca',
+  title: {
+    selectors: ['h1']
+  },
+  author: {
+    selectors: ['.authorText', '.bylineDetails']
+  },
+  date_published: {
+    selectors: [['.timeStamp[datetime]', 'datetime']]
+  },
+  dek: {
+    selectors: ['.deck']
+  },
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+  content: {
+    selectors: ['.story'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
-    clean: ['section.pl-post-link']
+    clean: []
  }
 };

@@ -6265,9 +6224,7 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
  WwwSiComExtractor: WwwSiComExtractor,
  WwwRawstoryComExtractor: WwwRawstoryComExtractor,
  WwwCnetComExtractor: WwwCnetComExtractor,
-  WwwCinemablendComExtractor: WwwCinemablendComExtractor,
  WwwTodayComExtractor: WwwTodayComExtractor,
-  WwwHowtogeekComExtractor: WwwHowtogeekComExtractor,
  WwwAlComExtractor: WwwAlComExtractor,
  WwwThepennyhoarderComExtractor: WwwThepennyhoarderComExtractor,
  WwwWesternjournalismComExtractor: WwwWesternjournalismComExtractor,
@@ -6276,7 +6233,6 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
  ScienceflyComExtractor: ScienceflyComExtractor,
  HellogigglesComExtractor: HellogigglesComExtractor,
  ThoughtcatalogComExtractor: ThoughtcatalogComExtractor,
-  WwwNjComExtractor: WwwNjComExtractor,
  WwwInquisitrComExtractor: WwwInquisitrComExtractor,
  WwwNbcnewsComExtractor: WwwNbcnewsComExtractor,
  FortuneComExtractor: FortuneComExtractor,
@@ -6343,7 +6299,9 @@ var CustomExtractors = /*#__PURE__*/Object.freeze({
  ArstechnicaComExtractor: ArstechnicaComExtractor,
  WwwNdtvComExtractor: WwwNdtvComExtractor,
  SpektrumExtractor: SpektrumExtractor,
-  PostlightComExtractor: PostlightComExtractor
+  PostlightComExtractor: PostlightComExtractor,
+  WwwInvestmentexecutiveComExtractor: WwwInvestmentexecutiveComExtractor,
+  WwwCbcCaExtractor: WwwCbcCaExtractor
 });

 var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
--- a/dist/mercury.web.js
+++ b/dist/mercury.web.js
--- a/fixtures/www.cinemablend.com/1482432215722.html
+++ b/fixtures/www.cinemablend.com/1482432215722.html
--- a/fixtures/www.howtogeek.com/1482438125052.html
+++ b/fixtures/www.howtogeek.com/1482438125052.html
--- a/fixtures/www.nj.com/1481666201503.html
+++ b/fixtures/www.nj.com/1481666201503.html
--- a/src/extractors/custom/index.js
+++ b/src/extractors/custom/index.js
@@ -64,9 +64,7 @@ export * from './www.androidcentral.com';
 export * from './www.si.com';
 export * from './www.rawstory.com';
 export * from './www.cnet.com';
-export * from './www.cinemablend.com';
 export * from './www.today.com';
-export * from './www.howtogeek.com';
 export * from './www.al.com';
 export * from './www.thepennyhoarder.com';
 export * from './www.westernjournalism.com';
@@ -75,7 +73,6 @@ export * from './www.americanow.com';
 export * from './sciencefly.com';
 export * from './hellogiggles.com';
 export * from './thoughtcatalog.com';
-export * from './www.nj.com';
 export * from './www.inquisitr.com';
 export * from './www.nbcnews.com';
 export * from './fortune.com';
--- a/src/extractors/custom/www.cinemablend.com/index.js
+++ b/src/extractors/custom/www.cinemablend.com/index.js
@@ -1,34 +0,0 @@
-export const WwwCinemablendComExtractor = {
-  domain: 'www.cinemablend.com',
-
-  title: {
-    selectors: ['.story_title'],
-  },
-
-  author: {
-    selectors: ['.author'],
-  },
-
-  date_published: {
-    selectors: [['meta[name="article:published_time"]', 'value']],
-
-    timezone: 'EST',
-  },
-
-  lead_image_url: {
-    selectors: [['meta[name="og:image"]', 'value']],
-  },
-
-  content: {
-    selectors: ['div#wrap_left_content'],
-
-    // Is there anything in the content you selected that needs transformed
-    // before it's consumable content? E.g., unusual lazy loaded images
-    transforms: {},
-
-    // Is there anything that is in the result that shouldn't be?
-    // The clean selectors will remove anything that matches from
-    // the result
-    clean: [],
-  },
-};
--- a/src/extractors/custom/www.cinemablend.com/index.test.js
+++ b/src/extractors/custom/www.cinemablend.com/index.test.js
@@ -1,102 +0,0 @@
-import assert from 'assert';
-import URL from 'url';
-import cheerio from 'cheerio';
-
-import Mercury from 'mercury';
-import getExtractor from 'extractors/get-extractor';
-import { excerptContent } from 'utils/text';
-
-const fs = require('fs');
-
-describe('WwwCinemablendComExtractor', () => {
-  describe('initial test case', () => {
-    let result;
-    let url;
-    beforeAll(() => {
-      url =
-        'http://www.cinemablend.com/news/1602870/to-3d-or-not-to-3d-buy-the-right-passengers-ticket';
-      const html = fs.readFileSync(
-        './fixtures/www.cinemablend.com/1482432215722.html'
-      );
-      result = Mercury.parse(url, { html, fallback: false });
-    });
-
-    it('is selected properly', () => {
-      // This test should be passing by default.
-      // It sanity checks that the correct parser
-      // is being selected for URLs from this domain
-      const extractor = getExtractor(url);
-      assert.equal(extractor.domain, URL.parse(url).hostname);
-    });
-
-    it('returns the title', async () => {
-      // To pass this test, fill out the title selector
-      // in ./src/extractors/custom/www.cinemablend.com/index.js.
-      const { title } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(
-        title,
-        'To 3D Or Not To 3D: Buy The Right Passengers Ticket'
-      );
-    });
-
-    it('returns the author', async () => {
-      // To pass this test, fill out the author selector
-      // in ./src/extractors/custom/www.cinemablend.com/index.js.
-      const { author } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(author, 'Mike Reyes');
-    });
-
-    it('returns the date_published', async () => {
-      // To pass this test, fill out the date_published selector
-      // in ./src/extractors/custom/www.cinemablend.com/index.js.
-      const { date_published } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(date_published, '2016-12-22T02:46:48.000Z');
-    });
-
-    it('returns the lead_image_url', async () => {
-      // To pass this test, fill out the lead_image_url selector
-      // in ./src/extractors/custom/www.cinemablend.com/index.js.
-      const { lead_image_url } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(
-        lead_image_url,
-        'http://img.cinemablend.com/quill/6/a/4/e/4/0/6a4e40e4aad46eb7b27810ce5a9fccff01c03c19.jpg'
-      );
-    });
-
-    it('returns the content', async () => {
-      // To pass this test, fill out the content selector
-      // in ./src/extractors/custom/www.cinemablend.com/index.js.
-      // You may also want to make use of the clean and transform
-      // options.
-      const { content } = await result;
-
-      const $ = cheerio.load(content || '');
-
-      const first13 = excerptContent(
-        $('*')
-          .first()
-          .text(),
-        13
-      );
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(
-        first13,
-        'What happens when you put two beautiful people together in the black of'
-      );
-    });
-  });
-});
--- a/src/extractors/custom/www.howtogeek.com/index.js
+++ b/src/extractors/custom/www.howtogeek.com/index.js
@@ -1,33 +0,0 @@
-export const WwwHowtogeekComExtractor = {
-  domain: 'www.howtogeek.com',
-
-  title: {
-    selectors: ['title'],
-  },
-
-  author: {
-    selectors: ['#authorinfobox a'],
-  },
-
-  date_published: {
-    selectors: ['#authorinfobox + div li'],
-    timezone: 'GMT',
-  },
-
-  lead_image_url: {
-    selectors: [['meta[name="og:image"]', 'value']],
-  },
-
-  content: {
-    selectors: ['.thecontent'],
-
-    // Is there anything in the content you selected that needs transformed
-    // before it's consumable content? E.g., unusual lazy loaded images
-    transforms: {},
-
-    // Is there anything that is in the result that shouldn't be?
-    // The clean selectors will remove anything that matches from
-    // the result
-    clean: [],
-  },
-};
--- a/src/extractors/custom/www.howtogeek.com/index.test.js
+++ b/src/extractors/custom/www.howtogeek.com/index.test.js
@@ -1,102 +0,0 @@
-import assert from 'assert';
-import URL from 'url';
-import cheerio from 'cheerio';
-
-import Mercury from 'mercury';
-import getExtractor from 'extractors/get-extractor';
-import { excerptContent } from 'utils/text';
-
-const fs = require('fs');
-
-describe('WwwHowtogeekComExtractor', () => {
-  describe('initial test case', () => {
-    let result;
-    let url;
-    beforeAll(() => {
-      url =
-        'http://www.howtogeek.com/282568/amazon-echo-vs.-google-home-which-one-should-you-buy/';
-      const html = fs.readFileSync(
-        './fixtures/www.howtogeek.com/1482438125052.html'
-      );
-      result = Mercury.parse(url, { html, fallback: false });
-    });
-
-    it('is selected properly', () => {
-      // This test should be passing by default.
-      // It sanity checks that the correct parser
-      // is being selected for URLs from this domain
-      const extractor = getExtractor(url);
-      assert.equal(extractor.domain, URL.parse(url).hostname);
-    });
-
-    it('returns the title', async () => {
-      // To pass this test, fill out the title selector
-      // in ./src/extractors/custom/www.howtogeek.com/index.js.
-      const { title } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(
-        title,
-        'Amazon Echo vs. Google Home: Which One Should You Buy?'
-      );
-    });
-
-    it('returns the author', async () => {
-      // To pass this test, fill out the author selector
-      // in ./src/extractors/custom/www.howtogeek.com/index.js.
-      const { author } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(author, 'Craig Lloyd');
-    });
-
-    it('returns the date_published', async () => {
-      // To pass this test, fill out the date_published selector
-      // in ./src/extractors/custom/www.rawstory.com/index.js.
-      const { date_published } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(date_published, '2016-12-10T00:00:00.000Z');
-    });
-
-    it('returns the lead_image_url', async () => {
-      // To pass this test, fill out the lead_image_url selector
-      // in ./src/extractors/custom/www.howtogeek.com/index.js.
-      const { lead_image_url } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(
-        lead_image_url,
-        'http://www.howtogeek.com/thumbcache/280/160/2c80a158f6eb69883931148a7dc900a0/wp-content/uploads/2016/11/2016-11-29_0011-650x301.jpg'
-      );
-    });
-
-    it('returns the content', async () => {
-      // To pass this test, fill out the content selector
-      // in ./src/extractors/custom/www.howtogeek.com/index.js.
-      // You may also want to make use of the clean and transform
-      // options.
-      const { content } = await result;
-
-      const $ = cheerio.load(content || '');
-
-      const first13 = excerptContent(
-        $('*')
-          .first()
-          .text(),
-        13
-      );
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(
-        first13,
-        'Amazon blew the industry wide open with its release of the Echo back'
-      );
-    });
-  });
-});
--- a/src/extractors/custom/www.nj.com/index.js
+++ b/src/extractors/custom/www.nj.com/index.js
@@ -1,34 +0,0 @@
-export const WwwNjComExtractor = {
-  domain: 'www.nj.com',
-
-  title: {
-    selectors: [['meta[name="title"]', 'value']],
-  },
-
-  author: {
-    selectors: [['meta[name="article_author"]', 'value']],
-  },
-
-  date_published: {
-    selectors: [['meta[name="article_date_original"]', 'value']],
-
-    timezone: 'America/New_York',
-  },
-
-  lead_image_url: {
-    selectors: [['meta[name="og:image"]', 'value']],
-  },
-
-  content: {
-    selectors: ['.entry-content'],
-
-    // Is there anything in the content you selected that needs transformed
-    // before it's consumable content? E.g., unusual lazy loaded images
-    transforms: {},
-
-    // Is there anything that is in the result that shouldn't be?
-    // The clean selectors will remove anything that matches from
-    // the result
-    clean: [],
-  },
-};
--- a/src/extractors/custom/www.nj.com/index.test.js
+++ b/src/extractors/custom/www.nj.com/index.test.js
@@ -1,100 +0,0 @@
-import assert from 'assert';
-import URL from 'url';
-import cheerio from 'cheerio';
-
-import Mercury from 'mercury';
-import getExtractor from 'extractors/get-extractor';
-import { excerptContent } from 'utils/text';
-
-const fs = require('fs');
-
-describe('WwwNjComExtractor', () => {
-  describe('initial test case', () => {
-    let result;
-    let url;
-    beforeAll(() => {
-      url =
-        'http://www.nj.com/essex/index.ssf/2016/12/man_sentenced_for_stealing_millions_from_nj_atms_i.html#incart_river_home';
-      const html = fs.readFileSync('./fixtures/www.nj.com/1481666201503.html');
-      result = Mercury.parse(url, { html, fallback: false });
-    });
-
-    it('is selected properly', () => {
-      // This test should be passing by default.
-      // It sanity checks that the correct parser
-      // is being selected for URLs from this domain
-      const extractor = getExtractor(url);
-      assert.equal(extractor.domain, URL.parse(url).hostname);
-    });
-
-    it('returns the title', async () => {
-      // To pass this test, fill out the title selector
-      // in ./src/extractors/custom/www.nj.com/index.js.
-      const { title } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(
-        title,
-        'Man sentenced for stealing millions in elaborate N.J. ATM skimming scheme'
-      );
-    });
-
-    it('returns the author', async () => {
-      // To pass this test, fill out the author selector
-      // in ./src/extractors/custom/www.nj.com/index.js.
-      const { author } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(author, 'Rajeev Dhir | NJ Advance Media for NJ.com');
-    });
-
-    it('returns the date_published', async () => {
-      // To pass this test, fill out the date_published selector
-      // in ./src/extractors/custom/www.nj.com/index.js.
-      const { date_published } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(date_published, '2016-12-13T21:51:00.000Z');
-    });
-
-    it('returns the lead_image_url', async () => {
-      // To pass this test, fill out the lead_image_url selector
-      // in ./src/extractors/custom/www.nj.com/index.js.
-      const { lead_image_url } = await result;
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(
-        lead_image_url,
-        'http://image.nj.com/home/njo-media/width620/img/njcom_photos/photo/2016/12/08/21671718-large.png'
-      );
-    });
-
-    it('returns the content', async () => {
-      // To pass this test, fill out the content selector
-      // in ./src/extractors/custom/www.nj.com/index.js.
-      // You may also want to make use of the clean and transform
-      // options.
-      const { content } = await result;
-
-      const $ = cheerio.load(content || '');
-
-      const first13 = excerptContent(
-        $('*')
-          .first()
-          .text(),
-        13
-      );
-
-      // Update these values with the expected values from
-      // the article.
-      assert.equal(
-        first13,
-        'NEWARK -- A Romanian native was sentenced to 57 months on Tuesday for'
-      );
-    });
-  });
-});