fix: incorrect parsing on theatlantic.com (#475)

* fix: incorrect parsing on theatlantic.com
* chore: updating theatlantic.com tests & fixtures
* chore: removing script data from minified fixture
5 years ago · 0686ee7956
parent 5e33263d25
commit 0686ee7956
3 changed files with 33 additions and 12 deletions
--- a/fixtures/www.theatlantic.com/1474321707642.html
+++ b/fixtures/www.theatlantic.com/1474321707642.html
--- a/src/extractors/custom/www.theatlantic.com/index.js
+++ b/src/extractors/custom/www.theatlantic.com/index.js
@@ -3,18 +3,15 @@
 export const TheAtlanticExtractor = {
  domain: 'www.theatlantic.com',
  title: {
-    selectors: ['h1.hed'],
+    selectors: ['h1', '.c-article-header__hed'],
  },

  author: {
-    selectors: ['article#article .article-cover-extra .metadata .byline a'],
+    selectors: [['meta[name="author"]', 'value'], '.c-byline__author'],
  },

  content: {
-    selectors: [
-      ['.article-cover figure.lead-img', '.article-body'],
-      '.article-body',
-    ],
+    selectors: ['article', '.article-body'],

    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
@@ -23,14 +20,29 @@ export const TheAtlanticExtractor = {
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
-    clean: ['.partner-box', '.callout'],
+    clean: [
+      '.partner-box',
+      '.callout',
+      '.c-article-writer__image',
+      '.c-article-writer__content',
+      '.c-letters-cta__text',
+      '.c-footer__logo',
+      '.c-recirculation-link',
+      '.twitter-tweet',
+    ],
+  },
+
+  dek: {
+    selectors: [['meta[name="description"]', 'value']],
  },

  date_published: {
-    selectors: [['time[itemProp="datePublished"]', 'datetime']],
+    selectors: [['time[itemprop="datePublished"]', 'datetime']],
  },

-  lead_image_url: null,
+  lead_image_url: {
+    selectors: [['img[itemprop="url"]', 'src']],
+  },

  next_page_url: null,

--- a/src/extractors/custom/www.theatlantic.com/index.test.js
+++ b/src/extractors/custom/www.theatlantic.com/index.test.js
@@ -35,7 +35,8 @@ describe('AtlanticExtractor', () => {
      // selectors in ./src/extractors/custom/www.theatlantic.com/index.js. This test is just
      // a stub; you can add more fields to test as much of
      // your parser as possible.
-      const { content, title, author } = await result;
+      const { content, title, author, dek, lead_image_url } = await result;
+
      const $ = cheerio.load(content);
      const text = $('*')
        .first()
@@ -48,7 +49,15 @@ describe('AtlanticExtractor', () => {
        'Why New Yorkers Received a Push Alert About a Manhunt'
      );
      assert.equal(author, 'Kaveh Waddell');
-      assert.equal(text, 'New York police offi');
+      assert.equal(text, 'The city has never b');
+      assert.equal(
+        dek,
+        'The city has never before used the emergency system the way it did Monday morning.'
+      );
+      assert.equal(
+        lead_image_url,
+        'https://cdn.theatlantic.com/assets/media/img/mt/2016/09/RTSO9RP/lead_720_405.jpg?mod=1533691849'
+      );
    });
  });
 });