feat: custom genius parser. (#284)

also adds ability to transform value returned by an attribute selector
5 years ago · eacd1ee97f
parent c389c966d7
commit eacd1ee97f
6 changed files with 1435 additions and 5 deletions
--- a/fixtures/genius.com/1550609084053.html
+++ b/fixtures/genius.com/1550609084053.html
--- a/src/extractors/custom/README.md
+++ b/src/extractors/custom/README.md
@ -328,6 +328,8 @@ As [explained above](#selecting-an-attribute), to return an attribute rather tha
  ...
 ```

+In rare circumstances, you may want to manipulate the attribute value before it is returned. In these cases, you can add a third element to the selector array above — a function that takes the attribute's value and returns the transformed value. E.g., imagine that you want to access a JSON value that's been stringified into an attribute. Your function could take the stringified JSON, parse it, and return just the piece of it you want.
+
 You can refer to the [NewYorkerExtractor](www.newyorker.com/index.js) to see the rest of the basic selectors.

 ### Step 4: Content extraction
--- a/src/extractors/custom/genius.com/index.js
+++ b/src/extractors/custom/genius.com/index.js
@ -0,0 +1,56 @@
+export const GeniusComExtractor = { // Custom extractor definition for genius.com
+  domain: 'genius.com',

+  title: {
+    selectors: ['h1'],
+  },

+  author: {
+    selectors: ['h2 a'],
+  },

+  date_published: {
+    selectors: [
+      [
+        'meta[itemprop=page_data]', // its `value` attribute is parsed as JSON below
+        'value',
+        res => { // res: the attribute value handed to this transform
+          const json = JSON.parse(res);
+          return json.song.release_date;
+        },
+      ],
+    ],
+  },

+  dek: {
+    selectors: [
+      // no dek selectors are defined for this extractor
+    ],
+  },

+  lead_image_url: {
+    selectors: [
+      [
+        'meta[itemprop=page_data]', // same JSON-bearing meta element as date_published
+        'value',
+        res => { // res: the attribute value handed to this transform
+          const json = JSON.parse(res);
+          return json.song.album.cover_art_url; // NOTE(review): throws if song.album is absent — confirm
+        },
+      ],
+    ],
+  },

+  content: {
+    selectors: ['.lyrics'],

+    // Transforms for selected content that needs reshaping before it is
+    // consumable; none are defined for this extractor.
+    transforms: {},

+    // Clean selectors: anything in the result that matches
+    // one of these is removed from the result.
+    // None are defined for this extractor.
+    clean: [],
+  },
+};
--- a/src/extractors/custom/genius.com/index.test.js
+++ b/src/extractors/custom/genius.com/index.test.js
@ -0,0 +1,98 @@
+import assert from 'assert';
+import URL from 'url';
+import cheerio from 'cheerio';
+import moment from 'moment';
+
+import Mercury from 'mercury';
+import getExtractor from 'extractors/get-extractor';
+import { excerptContent } from 'utils/text';
+
+const fs = require('fs');
+
+describe('GeniusComExtractor', () => {
+  describe('initial test case', () => {
+    let result; // value returned by Mercury.parse; each test awaits it
+    let url;
+    beforeAll(() => {
+      url = 'https://genius.com/Prince-and-the-revolution-purple-rain-lyrics';
+      const html = fs.readFileSync('./fixtures/genius.com/1550609084053.html');
+      result = Mercury.parse(url, { html, fallback: false });
+    });

+    it('is selected properly', () => {
+      // Sanity check: the extractor returned for this URL
+      // must be the one whose domain equals the URL's
+      // hostname.
+      const extractor = getExtractor(url);
+      assert.equal(extractor.domain, URL.parse(url).hostname);
+    });

+    it('returns the title', async () => {
+      // Checks the title produced by the title selector
+      // in ./src/extractors/custom/genius.com/index.js.
+      const { title } = await result;

+      // Expected title for the fixture page loaded in
+      // beforeAll.
+      assert.equal(title, `Purple Rain`);
+    });

+    it('returns the author', async () => {
+      // Checks the author produced by the author selector
+      // in ./src/extractors/custom/genius.com/index.js.
+      const { author } = await result;

+      // Expected author for the fixture page loaded in
+      // beforeAll.
+      assert.equal(author, 'Prince and the Revolution');
+    });

+    it('returns the date_published', async () => {
+      // Checks the date produced by the date_published selector
+      // in ./src/extractors/custom/genius.com/index.js.
+      const { date_published } = await result;
+      const newDatePublished = moment(date_published).format();

+      // Only the date portion (the text before 'T') is
+      // compared with the expected date.
+      assert.equal(newDatePublished.split('T')[0], '1984-06-25');
+    });

+    it('returns the lead_image_url', async () => {
+      // Checks the URL produced by the lead_image_url selector
+      // in ./src/extractors/custom/genius.com/index.js.
+      const { lead_image_url } = await result;

+      // Expected image URL for the fixture page loaded in
+      // beforeAll.
+      assert.equal(
+        lead_image_url,
+        `https://images.genius.com/da3381a38218928924c94db9ea59543b.1000x1000x1.jpg`
+      );
+    });

+    it('returns the content', async () => {
+      // Checks the content produced by the content selector
+      // in ./src/extractors/custom/genius.com/index.js
+      // by comparing an excerpt of the first element's text
+      // with the expected opening words.
+      const { content } = await result;

+      const $ = cheerio.load(content || '');

+      const first13 = excerptContent(
+        $('*')
+          .first()
+          .text(),
+        13
+      );

+      // Expected excerpt for the fixture page loaded in
+      // beforeAll.
+      assert.equal(
+        first13,
+        '[Verse 1] I never meant to cause you any sorrow I never meant'
+      );
+    });
+  });
+});
--- a/src/extractors/custom/index.js
+++ b/src/extractors/custom/index.js
@ -103,3 +103,4 @@ export * from './www.sanwa.co.jp';
 export * from './www.elecom.co.jp';
 export * from './scan.netsecurity.ne.jp';
 export * from './jvndb.jvn.jp';
+export * from './genius.com';
--- a/src/extractors/root-extractor.js
+++ b/src/extractors/root-extractor.js
@ -140,14 +140,15 @@ export function select(opts) {
  // if selector is an array (e.g., ['img', 'src']),
  // extract the attr
  if (Array.isArray(matchingSelector)) {
-    const [selector, attr] = matchingSelector;
+    const [selector, attr, transform] = matchingSelector;
    $match = $(selector);
    $match = transformAndClean($match);
-    result = $match.map((_, el) =>
-      $(el)
+    result = $match.map((_, el) => {
+      const item = $(el)
        .attr(attr)
-        .trim()
-    );
+        .trim();
+      return transform ? transform(item) : item;
+    });
  } else {
    $match = $(matchingSelector);
    $match = transformAndClean($match);