feat: update more parsers and add correct bloomberg html files

2 years ago · 7b75d04501
parent 8543db6017
commit 7b75d04501
21 changed files with 10158 additions and 331 deletions
--- a/dist/mercury.js
+++ b/dist/mercury.js
@@ -2205,14 +2205,14 @@ var WikiaExtractor = {
 var LittleThingsExtractor = {
  domain: 'www.littlethings.com',
  title: {
-    selectors: ['h1.post-title']
+    selectors: ['h1[class*="PostHeader"]', 'h1.post-title']
  },
  author: {
-    selectors: [['meta[name="author"]', 'value']]
+    selectors: ['div[class^="PostHeader__ScAuthorNameSection"]', ['meta[name="author"]', 'value']]
  },
  content: {
    selectors: [// enter content selectors
-    '.mainContentIntro', '.content-wrapper'],
+    'section[class*="PostMainArticle"]', '.mainContentIntro', '.content-wrapper'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
@@ -2238,11 +2238,11 @@ var PoliticoExtractor = {
    ['meta[name="og:title"]', 'value']]
  },
  author: {
-    selectors: ['.story-main-content .byline .vcard']
+    selectors: ['.story-meta__authors .vcard', '.story-main-content .byline .vcard']
  },
  content: {
    selectors: [// enter content selectors
-    '.story-main-content', '.content-group', '.story-core', '.story-text'],
+    ['p.story-text__paragraph   '], '.story-main-content', '.content-group', '.story-core', '.story-text'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: [],
@@ -2252,7 +2252,7 @@ var PoliticoExtractor = {
    clean: ['figcaption']
  },
  date_published: {
-    selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']]
+    selectors: ['.story-meta__details time[datetime]', ['.story-main-content .timestamp time[datetime]', 'datetime']]
  },
  lead_image_url: {
    selectors: [// enter lead_image_url selectors
@@ -2815,9 +2815,7 @@ var WwwTheguardianComExtractor = {
    selectors: ['#maincontent', '.content__article-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
-    transforms: {
-      'h2': 'h4'
-    },
+    transforms: {},
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
@@ -2860,24 +2858,24 @@ var WwwBloombergComExtractor = {
    selectors: [// normal articles
    '.lede-headline', // /graphics/ template
    'h1.article-title', // /news/ template
-    'h1.lede-text-only__hed']
+    'h1[class^="headline"]', 'h1.lede-text-only__hed']
  },
  author: {
    selectors: [['meta[name="parsely-author"]', 'value'], '.byline-details__link', // /graphics/ template
    '.bydek', // /news/ template
-    '.author']
+    '.author', 'p[class*="author"]']
  },
  date_published: {
-    selectors: [['time.published-at', 'datetime'], ['time[datetime]', 'datetime'], ['meta[name="date"]', 'value'], ['meta[name="parsely-pub-date"]', 'value']]
+    selectors: [['time.published-at', 'datetime'], ['time[datetime]', 'datetime'], ['meta[name="date"]', 'value'], ['meta[name="parsely-pub-date"]', 'value'], ['meta[name="parsely-pub-date"]', 'content']]
  },
  dek: {
    selectors: []
  },
  lead_image_url: {
-    selectors: [['meta[name="og:image"]', 'value']]
+    selectors: [['meta[name="og:image"]', 'value'], ['meta[name="og:image"]', 'content']]
  },
  content: {
-    selectors: ['.article-body__content', // /graphics/ template
+    selectors: ['.article-body__content', '.body-content', // /graphics/ template
    ['section.copy-block'], // /news/ template
    '.body-copy'],
    // Is there anything in the content you selected that needs transformed
@@ -3459,21 +3457,22 @@ var WwwMentalflossComExtractor = {
 var AbcnewsGoComExtractor = {
  domain: 'abcnews.go.com',
  title: {
-    selectors: ['.article-header h1']
+    selectors: ['div[class*="Article_main__body"] h1', '.article-header h1']
  },
  author: {
-    selectors: ['.authors'],
+    selectors: ['.ShareByline span:nth-child(2)', '.authors'],
    clean: ['.author-overlay', '.by-text']
  },
  date_published: {
-    selectors: ['.timestamp'],
+    selectors: ['.ShareByline', '.timestamp'],
+    format: 'MMMM D, YYYY h:mm a',
    timezone: 'America/New_York'
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
-    selectors: ['.article-copy'],
+    selectors: ['article', '.article-copy'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
@@ -3735,19 +3734,19 @@ var UproxxComExtractor = {
 var WwwEonlineComExtractor = {
  domain: 'www.eonline.com',
  title: {
-    selectors: ['h1.article__title']
+    selectors: ['h1.article-detail__title', 'h1.article__title']
  },
  author: {
-    selectors: ['.entry-meta__author a']
+    selectors: ['.article-detail__meta__author', '.entry-meta__author a']
  },
  date_published: {
-    selectors: [['meta[itemprop="datePublished"]', 'value']]
+    selectors: [['meta[name="article:published_time"]', 'value'], ['meta[itemprop="datePublished"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
-    selectors: [['.post-content section, .post-content div.post-content__image']],
+    selectors: [['.article-detail__main-content section'], ['.post-content section, .post-content div.post-content__image']],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
@@ -4155,19 +4154,19 @@ var ScienceflyComExtractor = {
 var HellogigglesComExtractor = {
  domain: 'hellogiggles.com',
  title: {
-    selectors: ['.title']
+    selectors: [['meta[name="og:title"]', 'value'], '.title']
  },
  author: {
-    selectors: ['.author-link']
+    selectors: ['.byline-wrapper span.author_name', '.author-link']
  },
  date_published: {
-    selectors: [['meta[name="article:published_time"]', 'value']]
+    selectors: [['meta[property="article:published_time"]', 'content'], ['meta[name="article:published_time"]', 'value']]
  },
  lead_image_url: {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
-    selectors: ['.entry-content'],
+    selectors: ['.main-content', '.entry-content'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {},
@@ -4472,10 +4471,11 @@ var GothamistComExtractor = {
    selectors: ['h1', '.entry-header h1']
  },
  author: {
-    selectors: ['.author']
+    // There are multiple article-metadata and byline-author classes, but the main article's is the 3rd child of the l-container class
+    selectors: ['.article-metadata:nth-child(3) .byline-author', '.author']
  },
  date_published: {
-    selectors: ['abbr', 'abbr.published'],
+    selectors: [['meta[name="article:published_time"]', 'value'], 'abbr', 'abbr.published'],
    timezone: 'America/New_York'
  },
  dek: {
@@ -4485,7 +4485,7 @@ var GothamistComExtractor = {
    selectors: [['meta[name="og:image"]', 'value']]
  },
  content: {
-    selectors: ['.entry-body'],
+    selectors: ['.article-body', '.entry-body'],
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
--- a/fixtures/abcnews.go.com.html
+++ b/fixtures/abcnews.go.com.html
--- a/fixtures/gothamist.com.html
+++ b/fixtures/gothamist.com.html
--- a/fixtures/www.bloomberg.com--graphics.html
+++ b/fixtures/www.bloomberg.com--graphics.html
--- a/fixtures/www.bloomberg.com--news.html
+++ b/fixtures/www.bloomberg.com--news.html
--- a/fixtures/www.bloomberg.com.html
+++ b/fixtures/www.bloomberg.com.html
--- a/fixtures/www.eonline.com.html
+++ b/fixtures/www.eonline.com.html
--- a/fixtures/www.littlethings.com.html
+++ b/fixtures/www.littlethings.com.html
--- a/fixtures/www.qdaily.com.html
+++ b/fixtures/www.qdaily.com.html
--- a/src/extractors/custom/abcnews.go.com/index.js
+++ b/src/extractors/custom/abcnews.go.com/index.js
@@ -2,16 +2,17 @@ export const AbcnewsGoComExtractor = {
  domain: 'abcnews.go.com',

  title: {
-    selectors: ['.article-header h1'],
+    selectors: ['div[class*="Article_main__body"] h1', '.article-header h1'],
  },

  author: {
-    selectors: ['.authors'],
+    selectors: ['.ShareByline span:nth-child(2)', '.authors'],
    clean: ['.author-overlay', '.by-text'],
  },

  date_published: {
-    selectors: ['.timestamp'],
+    selectors: ['.ShareByline', '.timestamp'],
+    format: 'MMMM D, YYYY h:mm a',
    timezone: 'America/New_York',
  },

@@ -20,7 +21,7 @@ export const AbcnewsGoComExtractor = {
  },

  content: {
-    selectors: ['.article-copy'],
+    selectors: ['article', '.article-copy'],

    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
--- a/src/extractors/custom/abcnews.go.com/index.test.js
+++ b/src/extractors/custom/abcnews.go.com/index.test.js
@@ -36,7 +36,7 @@ describe('AbcnewsGoComExtractor', () => {
      // the article.
      assert.equal(
        title,
-        "Hillary Clinton: Putin's Alleged Involvement in Democratic Hack Stems From Longtime Grudge"
+        "Hillary Clinton: Putin's Alleged Involvement in Democratic Hack Stems From 'Personal Beef'"
      );
    });

@@ -47,7 +47,7 @@ describe('AbcnewsGoComExtractor', () => {

      // Update these values with the expected values from
      // the article.
-      assert.equal(author, 'Josh Haskell David Caplan PATRICK REEVELL');
+      assert.equal(author, 'JOSH HASKELL, DAVID CAPLAN and PATRICK REEVELL');
    });

    it('returns the date_published', async () => {
@@ -57,7 +57,7 @@ describe('AbcnewsGoComExtractor', () => {

      // Update these values with the expected values from
      // the article.
-      assert.equal(date_published, '2016-12-16T17:37:00.000Z');
+      assert.equal(date_published, '2016-12-16T21:19:00.000Z');
    });

    it('returns the lead_image_url', async () => {
@@ -69,7 +69,7 @@ describe('AbcnewsGoComExtractor', () => {
      // the article.
      assert.equal(
        lead_image_url,
-        'http://a.abcnews.com/images/Politics/AP-hillary-clinton-01-as-161216_16x9_992.jpg'
+        'https://s.abcnews.com/images/Politics/AP-hillary-clinton-01-as-161216_16x9_992.jpg'
      );
    });

@@ -93,7 +93,7 @@ describe('AbcnewsGoComExtractor', () => {
      // the article.
      assert.equal(
        first13,
-        "Hillary Clinton has an explanation for Vladimir Putin's alleged involvement in the hacking"
+        "&#151; -- Hillary Clinton has an explanation for Vladimir Putin's alleged involvement in"
      );
    });
  });
--- a/src/extractors/custom/gothamist.com/index.js
+++ b/src/extractors/custom/gothamist.com/index.js
@@ -14,11 +14,16 @@ export const GothamistComExtractor = {
  },

  author: {
-    selectors: ['.author'],
+    // There are multiple article-metadata and byline-author classes, but the main article's is the 3rd child of the l-container class
+    selectors: ['.article-metadata:nth-child(3) .byline-author', '.author'],
  },

  date_published: {
-    selectors: ['abbr', 'abbr.published'],
+    selectors: [
+      ['meta[name="article:published_time"]', 'value'],
+      'abbr',
+      'abbr.published',
+    ],

    timezone: 'America/New_York',
  },
@@ -32,7 +37,7 @@ export const GothamistComExtractor = {
  },

  content: {
-    selectors: ['.entry-body'],
+    selectors: ['.article-body', '.entry-body'],

    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
--- a/src/extractors/custom/gothamist.com/index.test.js
+++ b/src/extractors/custom/gothamist.com/index.test.js
@@ -44,9 +44,11 @@ describe('GothamistComExtractor', () => {
      // in ./src/extractors/custom/gothamist.com/index.js.
      const { author } = await result;

+      console.log(author.toString());
+
      // Update these values with the expected values from
      // the article.
-      assert.equal(author, 'Nathan Tempey');
+      assert.equal(author, 'Nathan\xa0Tempey');
    });

    it('returns the date_published', async () => {
@@ -78,7 +80,7 @@ describe('GothamistComExtractor', () => {
      // the article.
      assert.equal(
        lead_image_url,
-        'http://gothamist.com/assets_c/2017/03/030717FalloutShelter80NY-5-thumb-640xauto-989222.jpg'
+        'https://cms.prod.nypr.digital/images/42584/fill-1200x650/'
      );
    });

@@ -102,7 +104,7 @@ describe('GothamistComExtractor', () => {
      // the article.
      assert.equal(
        first13,
-        'The basement at 80 New York Avenue in Crown Heights is one of'
+        "You've seen the placards around town, beside the front door on apartment buildings,"
      );
    });
  });
--- a/src/extractors/custom/www.bloomberg.com/index.js
+++ b/src/extractors/custom/www.bloomberg.com/index.js
@@ -10,6 +10,7 @@ export const WwwBloombergComExtractor = {
      'h1.article-title',

      // /news/ template
+      'h1[class^="headline"]',
      'h1.lede-text-only__hed',
    ],
  },
@@ -24,6 +25,7 @@ export const WwwBloombergComExtractor = {

      // /news/ template
      '.author',
+      'p[class*="author"]',
    ],
  },

@@ -33,6 +35,7 @@ export const WwwBloombergComExtractor = {
      ['time[datetime]', 'datetime'],
      ['meta[name="date"]', 'value'],
      ['meta[name="parsely-pub-date"]', 'value'],
+      ['meta[name="parsely-pub-date"]', 'content'],
    ],
  },

@@ -41,12 +44,16 @@ export const WwwBloombergComExtractor = {
  },

  lead_image_url: {
-    selectors: [['meta[name="og:image"]', 'value']],
+    selectors: [
+      ['meta[name="og:image"]', 'value'],
+      ['meta[name="og:image"]', 'content'],
+    ],
  },

  content: {
    selectors: [
      '.article-body__content',
+      '.body-content',

      // /graphics/ template
      ['section.copy-block'],
--- a/src/extractors/custom/www.bloomberg.com/index.test.js
+++ b/src/extractors/custom/www.bloomberg.com/index.test.js
@@ -69,7 +69,7 @@ describe('WwwBloombergComExtractor', () => {
      // the article.
      assert.equal(
        lead_image_url,
-        'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ioUAfA1V2nzk/v0/-1x-1.jpg'
+        'https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ioUAfA1V2nzk/v0/1200x675.jpg'
      );
    });

--- a/src/extractors/custom/www.eonline.com/index.js
+++ b/src/extractors/custom/www.eonline.com/index.js
@@ -2,15 +2,18 @@ export const WwwEonlineComExtractor = {
  domain: 'www.eonline.com',

  title: {
-    selectors: ['h1.article__title'],
+    selectors: ['h1.article-detail__title', 'h1.article__title'],
  },

  author: {
-    selectors: ['.entry-meta__author a'],
+    selectors: ['.article-detail__meta__author', '.entry-meta__author a'],
  },

  date_published: {
-    selectors: [['meta[itemprop="datePublished"]', 'value']],
+    selectors: [
+      ['meta[name="article:published_time"]', 'value'],
+      ['meta[itemprop="datePublished"]', 'value'],
+    ],
  },

  lead_image_url: {
@@ -19,6 +22,7 @@ export const WwwEonlineComExtractor = {

  content: {
    selectors: [
+      ['.article-detail__main-content section'],
      ['.post-content section, .post-content div.post-content__image'],
    ],

--- a/src/extractors/custom/www.eonline.com/index.test.js
+++ b/src/extractors/custom/www.eonline.com/index.test.js
@@ -57,7 +57,7 @@ describe('WwwEonlineComExtractor', () => {

      // Update these values with the expected values from
      // the article.
-      assert.equal(date_published, '2016-12-12T06:00:00.000Z');
+      assert.equal(date_published, '2016-12-12T14:00:00.000Z');
    });

    it('returns the lead_image_url', async () => {
@@ -69,7 +69,7 @@ describe('WwwEonlineComExtractor', () => {
      // the article.
      assert.equal(
        lead_image_url,
-        'http://akns-images.eonline.com/eol_images/Entire_Site/2016117/rs_300x300-161207101544-600.ryan-gosling-gq.12716.jpg?downsize=600:*&crop=600:315;left,top'
+        'https://akns-images.eonline.com/eol_images/Entire_Site/2016117/rs_600x600-161207101544-600.ryan-gosling-gq.12716.jpg?fit=around%7C1080:1080&output-quality=90&crop=1080:1080;center,top'
      );
    });

@@ -93,7 +93,7 @@ describe('WwwEonlineComExtractor', () => {
      // the article.
      assert.equal(
        first13,
-        "Ryan Gosling's most cherished role won't win him any Hollywood awards.With his musical"
+        "Ryan Gosling's most cherished role won't win him any Hollywood awards. With his"
      );
    });
  });
--- a/src/extractors/custom/www.littlethings.com/index.js
+++ b/src/extractors/custom/www.littlethings.com/index.js
@@ -5,6 +5,7 @@ export const LittleThingsExtractor = {
  domain: 'www.littlethings.com',
  title: {
    selectors: [
+      'h1[class*="PostHeader"]',
      'h1.post-title',
      // enter title selectors
    ],
@@ -12,6 +13,7 @@ export const LittleThingsExtractor = {

  author: {
    selectors: [
+      'div[class^="PostHeader__ScAuthorNameSection"]',
      ['meta[name="author"]', 'value'],
      // enter author selectors
    ],
@@ -20,6 +22,7 @@ export const LittleThingsExtractor = {
  content: {
    selectors: [
      // enter content selectors
+      'section[class*="PostMainArticle"]',
      '.mainContentIntro',
      '.content-wrapper',
    ],
--- a/src/extractors/custom/www.littlethings.com/index.test.js
+++ b/src/extractors/custom/www.littlethings.com/index.test.js
@@ -14,7 +14,8 @@ describe('LittleThingsExtractor', () => {
    let result;
    let url;
    beforeAll(() => {
-      url = 'http://www.littlethings.com/diy-pineapple-lamp/';
+      url =
+        'https://www.littlethings.com/lifestyle/amazon-has-a-private-food-brand-that-just-launched-100-new-products-for-fall/';
      const html = fs.readFileSync('./fixtures/www.littlethings.com.html');
      result = Mercury.parse(url, { html, fallback: false });
    });
@@ -38,7 +39,7 @@ describe('LittleThingsExtractor', () => {
      // the article.
      assert.equal(
        title,
-        'Snip The Stems Off Plastic Spoons To Make A Quirky Pineapple Lamp'
+        'Amazon Has A Private Food Brand That Just Launched 100 New Products For Fall'
      );
    });

@@ -49,7 +50,7 @@ describe('LittleThingsExtractor', () => {

      // Update these values with the expected values from
      // the article.
-      assert.equal(author, 'Laura Caseley');
+      assert.equal(author, 'Bethany Braun-Silva');
    });

    it('returns the lead_image_url', async () => {
@@ -61,7 +62,7 @@ describe('LittleThingsExtractor', () => {
      // the article.
      assert.equal(
        lead_image_url,
-        'http://cdn1.littlethings.com/app/uploads/2016/09/pineapple-b-thumb-1.jpg'
+        'https://images.ctfassets.net/f60q1anpxzid/6qQL1s7jrdjipeebu7wkNB/80ba587f55cc9f3673582f493dc611f1/04764069-41b3-48d3-be5a-4cde1c41.jpeg?w=1800&q=50&fm=jpg&fl=progressive'
      );
    });

@@ -85,7 +86,7 @@ describe('LittleThingsExtractor', () => {
      // the article.
      assert.equal(
        first13,
-        'Every room needs light, and so lamps are pretty much a necessity for'
+        "One of the best things about fall is all the food offerings. What's"
      );
    });
  });
--- a/src/extractors/custom/www.politico.com/index.js
+++ b/src/extractors/custom/www.politico.com/index.js
@@ -11,12 +11,16 @@ export const PoliticoExtractor = {
  },

  author: {
-    selectors: ['.story-main-content .byline .vcard'],
+    selectors: [
+      '.story-meta__authors .vcard',
+      '.story-main-content .byline .vcard',
+    ],
  },

  content: {
    selectors: [
      // enter content selectors
+      ['p.story-text__paragraph   '],
      '.story-main-content',
      '.content-group',
      '.story-core',
@@ -34,7 +38,10 @@ export const PoliticoExtractor = {
  },

  date_published: {
-    selectors: [['.story-main-content .timestamp time[datetime]', 'datetime']],
+    selectors: [
+      '.story-meta__details time[datetime]',
+      ['.story-main-content .timestamp time[datetime]', 'datetime'],
+    ],
  },

  lead_image_url: {
--- a/src/extractors/custom/www.politico.com/index.test.js
+++ b/src/extractors/custom/www.politico.com/index.test.js
@@ -57,7 +57,7 @@ describe('PoliticoExtractor', () => {

      // Update these values with the expected values from
      // the article.
-      assert.equal(date_published, '2016-10-04T09:07:00.000Z');
+      assert.equal(date_published, '2016-10-04T10:07:00.000Z');
    });

    it('returns the lead_image_url', async () => {
@@ -69,7 +69,7 @@ describe('PoliticoExtractor', () => {
      // the article.
      assert.equal(
        lead_image_url,
-        'http://static.politico.com/0f/e7/5ee9a89044d1a01f74140bcd5b9e/caucus-vp-preview.jpg'
+        'https://static.politico.com/0f/e7/5ee9a89044d1a01f74140bcd5b9e/caucus-vp-preview.jpg'
      );
    });