feat: ability to add custom extractors via api (#484)

* feat: ability to add custom extractors via api

* docs: updating readme

* fix: example.com was being used in another test

* fix: timezone was messing up date_published test

* fix: using a unique site for testing

* fix: updated custom extractor api

* docs: updating readme

* fix: removing unused fixture

* fix: updating test description

* feat: ability to add custom extractors via cli
Michael Ashley 5 years ago committed by GitHub
parent f95947fe88
commit e12c916499

@ -137,6 +137,9 @@ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --ext
# Get the value of attributes by adding a pipe to --extend or --extend-list
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list links=".body a|href"
# Pass optional --add-extractor argument to add a custom extractor at runtime.
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js
```
## License

@ -14,8 +14,17 @@ const {
l,
header,
h,
addExtractor,
x,
} = argv;
(async (urlToParse, contentType, extendedTypes, extendedListTypes, headers) => {
(async (
urlToParse,
contentType,
extendedTypes,
extendedListTypes,
headers,
addExtractor
) => {
if (!urlToParse) {
console.log(
'\n\
@ -23,7 +32,7 @@ mercury-parser\n\n\
The Mercury Parser extracts semantic content from any url\n\n\
Usage:\n\
\n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... \n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... [--add-extractor path_to_extractor.js]... \n\
\n\
'
);
@ -37,6 +46,7 @@ Usage:\n\
text: 'text',
txt: 'text',
};
const extensions = {};
[].concat(extendedTypes || []).forEach(t => {
const [name, selector] = t.split('=');
@ -53,10 +63,18 @@ Usage:\n\
allowMultiple: true,
};
});
// Attempt to load custom extractor from path.
let customExtractor;
if (addExtractor) {
customExtractor = require(addExtractor);
}
const result = await Mercury.parse(urlToParse, {
contentType: contentTypeMap[contentType],
extend: extensions,
headers,
customExtractor,
});
console.log(JSON.stringify(result, null, 2));
} catch (e) {
@ -75,4 +93,11 @@ Usage:\n\
console.error(`\n${reportBug}\n`);
process.exit(1);
}
})(url, format || f, extend || e, extendList || l, header || h);
})(
url,
format || f,
extend || e,
extendList || l,
header || h,
addExtractor || x
);

File diff suppressed because it is too large.

@ -0,0 +1,16 @@
import mergeSupportedDomains from '../utils/merge-supported-domains';
export const apiExtractors = {};
export default function addExtractor(extractor) {
if (!extractor || !extractor.domain) {
return {
error: true,
message: 'Unable to add custom extractor. Invalid parameters.',
};
}
Object.assign(apiExtractors, mergeSupportedDomains(extractor));
return apiExtractors;
}
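
The registry is just a plain object keyed by domain. Assuming `mergeSupportedDomains` keys the extractor by its `domain` (and by any `supportedDomains` it declares), registration works roughly like this sketch (the domain and selector below are illustrative only):
```javascript
import addExtractor, { apiExtractors } from './add-extractor';

// Hypothetical extractor; 'example.com' and the selector are illustrative.
addExtractor({
  domain: 'example.com',
  supportedDomains: ['www.example.com'],
  title: { selectors: ['h1'] },
});

// Assuming mergeSupportedDomains keys the extractor by its domain and each
// supportedDomain, the registry now contains both hostnames:
console.log(Object.keys(apiExtractors)); // ['example.com', 'www.example.com']
```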

@ -0,0 +1,22 @@
import assert from 'assert';
import addExtractor from './add-extractor';
describe('addExtractor(extractor)', () => {
it('can add multiple custom extractors', () => {
addExtractor({ domain: 'www.site1.com' });
addExtractor({ domain: 'www.site2.com' });
const result = addExtractor({ domain: 'www.site3.com' });
assert.equal(Object.keys(result).length, 3);
});
it('returns error if an extractor is not provided', () => {
const result = addExtractor();
assert.equal(result.error, true);
});
it('returns error if a domain key is not included within the custom extractor', () => {
const result = addExtractor({ test: 'abc' });
assert.equal(result.error, true);
});
});

@ -349,3 +349,62 @@ This script will open both an `html` and `json` file allowing you to preview you
If you've written a custom extractor, please send us a pull request! Passing tests that demonstrate your parser in action will help us evaluate the parser.
Sometimes you may find that the site you're parsing doesn't provide certain information. For example, some sites don't have deks, and in those instances, you don't need to write a selector for that field. If there's a test for a selector you don't need, you can just remove that test and make note of it in your pull request.
---
## Adding a Custom Extractor via API
As of **version 2.1.1**, you can also add private custom extractors at runtime via the API. Make sure your custom extractor includes a domain name. Note that extractors added via the API take precedence over the packaged custom extractors.
```javascript
const customExtractor = {
domain: 'www.sandiegouniontribune.com',
title: {
selectors: ['h1', '.ArticlePage-headline'],
},
author: {
selectors: ['.ArticlePage-authorInfo-bio-name'],
},
content: {
selectors: ['article'],
},
};
Mercury.addExtractor(customExtractor);
```
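Once registered, the extractor is picked up automatically by subsequent `Mercury.parse()` calls for matching URLs. A minimal sketch of that flow, reusing the extractor above and assuming `Mercury` has already been imported; the URL is the one used in the test fixture and is illustrative here:
```javascript
Mercury.addExtractor(customExtractor);

// Pages on www.sandiegouniontribune.com are now parsed with the custom selectors.
Mercury.parse(
  'https://www.sandiegouniontribune.com/business/growth-development/story/2019-08-27/sdsu-mission-valley-stadium-management-firm'
).then(result => {
  console.log(result.title, result.author);
});
```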
---
## Passing a Custom Extractor via the CLI
It's also possible to add a custom extractor at runtime via the CLI.
### 1. Create your custom extractor in a standalone file.
```javascript
var customExtractor = {
domain: 'postlight.com',
title: {
selectors: ['h1'],
},
author: {
selectors: ['.byline-name'],
},
content: {
selectors: ['article'],
},
extend: {
uniqueKeyFromFixture: {
selectors: ['.single__hero-category'],
},
},
};
module.exports = customExtractor;
```
### 2. From the CLI, pass the `--add-extractor` argument:
```bash
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js
```
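The file passed to `--add-extractor` is loaded with `require()` (see the CLI change above), so it must export the extractor object via `module.exports`, as in the standalone file from step 1. As with the API, an extractor added at runtime takes precedence over any packaged extractor for the same domain.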

@ -0,0 +1,19 @@
var customExtractor = {
domain: 'postlight.com',
title: {
selectors: ['h1'],
},
author: {
selectors: ['.byline-name'],
},
content: {
selectors: ['article'],
},
extend: {
uniqueKeyFromFixture: {
selectors: ['.single__hero-category'],
},
},
};
module.exports = customExtractor;

@ -3,6 +3,7 @@ import URL from 'url';
import Extractors from './all';
import GenericExtractor from './generic';
import detectByHtml from './detect-by-html';
import { apiExtractors } from './add-extractor';
export default function getExtractor(url, parsedUrl, $) {
parsedUrl = parsedUrl || URL.parse(url);
@ -13,6 +14,8 @@ export default function getExtractor(url, parsedUrl, $) {
.join('.');
return (
apiExtractors[hostname] ||
apiExtractors[baseDomain] ||
Extractors[hostname] ||
Extractors[baseDomain] ||
detectByHtml($) ||
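This lookup order is what gives runtime extractors precedence: the `apiExtractors` registry is consulted first, by full hostname and then by base domain, before the packaged `Extractors` and HTML-based detection are tried.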

@ -4,6 +4,7 @@ import TurndownService from 'turndown';
import Resource from 'resource';
import { validateUrl } from 'utils';
import addCustomExtractor from 'extractors/add-extractor';
import getExtractor from 'extractors/get-extractor';
import RootExtractor, { selectExtendedTypes } from 'extractors/root-extractor';
import collectAllPages from 'extractors/collect-all-pages';
@ -16,6 +17,7 @@ const Mercury = {
contentType = 'html',
headers = {},
extend,
customExtractor,
} = opts;
// if no url was passed and this is the browser version,
@ -43,6 +45,11 @@ const Mercury = {
return $;
}
// Add custom extractor via cli.
if (customExtractor) {
addCustomExtractor(customExtractor);
}
const Extractor = getExtractor(url, parsedUrl, $);
// console.log(`Using extractor for ${Extractor.domain}`);
@ -112,6 +119,10 @@ const Mercury = {
fetchResource(url) {
return Resource.create(url);
},
addExtractor(extractor) {
return addCustomExtractor(extractor);
},
};
export default Mercury;
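Two entry points are wired up here: `Mercury.addExtractor(extractor)` exposes the registry directly for API use, while the `customExtractor` option on `Mercury.parse()` registers the extractor just before extractor lookup, which is how the CLI's `--add-extractor` flag is plumbed through.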

@ -182,4 +182,40 @@ describe('Mercury', () => {
assert.equal(sites.length, 8);
assert.equal(sites[1], 'http://nymag.com/daily/intelligencer/');
});
it('is able to use custom extractors (with extension) added via api', async () => {
const url =
'https://www.sandiegouniontribune.com/business/growth-development/story/2019-08-27/sdsu-mission-valley-stadium-management-firm';
const html = fs.readFileSync(
'./fixtures/sandiegouniontribune.com/test.html',
'utf8'
);
const customExtractor = {
domain: 'www.sandiegouniontribune.com',
title: {
selectors: ['h1', '.ArticlePage-headline'],
},
author: {
selectors: ['.ArticlePage-authorInfo-bio-name'],
},
content: {
selectors: ['article'],
},
extend: {
testContent: {
selectors: ['.ArticlePage-breadcrumbs a'],
},
},
};
Mercury.addExtractor(customExtractor);
const result = await Mercury.parse(url, { html });
assert.equal(typeof result, 'object');
assert.equal(result.author, 'Jennifer Van Grove');
assert.equal(result.domain, 'www.sandiegouniontribune.com');
assert.equal(result.total_pages, 1);
assert.equal(result.testContent, 'Growth & Development');
});
});
