feat: ability to add custom extractors via api (#484)

* feat: ability to add custom extractors via api

* docs: updating readme

* fix: example.com was being used in another test

* fix: timezone was messing up date_published test

* fix: using a unique site for testing

* fix: updated custom extractor api

* docs: updating readme

* fix: removing unused fixture

* fix: updating test description

* feat: ability to add custom extractors via cli
Michael Ashley 5 years ago committed by GitHub
parent f95947fe88
commit e12c916499

@ -137,6 +137,9 @@ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --ext
# Get the value of attributes by adding a pipe to --extend or --extend-list
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list links=".body a|href"
# Pass optional --add-extractor argument to add a custom extractor at runtime.
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js
```
## License

@ -14,8 +14,17 @@ const {
l,
header,
h,
addExtractor,
x,
} = argv;
(async (urlToParse, contentType, extendedTypes, extendedListTypes, headers) => {
(async (
urlToParse,
contentType,
extendedTypes,
extendedListTypes,
headers,
addExtractor
) => {
if (!urlToParse) {
console.log(
'\n\
@ -23,7 +32,7 @@ mercury-parser\n\n\
The Mercury Parser extracts semantic content from any url\n\n\
Usage:\n\
\n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... \n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... [--add-extractor path_to_extractor.js]... \n\
\n\
'
);
@ -37,6 +46,7 @@ Usage:\n\
text: 'text',
txt: 'text',
};
const extensions = {};
[].concat(extendedTypes || []).forEach(t => {
const [name, selector] = t.split('=');
@ -53,10 +63,18 @@ Usage:\n\
allowMultiple: true,
};
});
// Attempt to load custom extractor from path.
let customExtractor;
if (addExtractor) {
customExtractor = require(addExtractor);
}
const result = await Mercury.parse(urlToParse, {
contentType: contentTypeMap[contentType],
extend: extensions,
headers,
customExtractor,
});
console.log(JSON.stringify(result, null, 2));
} catch (e) {
@ -75,4 +93,11 @@ Usage:\n\
console.error(`\n${reportBug}\n`);
process.exit(1);
}
})(url, format || f, extend || e, extendList || l, header || h);
})(
url,
format || f,
extend || e,
extendList || l,
header || h,
addExtractor || x
);

File diff suppressed because it is too large.

@ -0,0 +1,16 @@
import mergeSupportedDomains from '../utils/merge-supported-domains';
export const apiExtractors = {};
export default function addExtractor(extractor) {
if (!extractor || !extractor.domain) {
return {
error: true,
message: 'Unable to add custom extractor. Invalid parameters.',
};
}
Object.assign(apiExtractors, mergeSupportedDomains(extractor));
return apiExtractors;
}
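
The registry is just a plain object keyed by domain. Assuming `mergeSupportedDomains` keys the extractor by its `domain` (and by any `supportedDomains` it declares), registration works roughly like this sketch (the domain and selector below are illustrative only):
```javascript
import addExtractor, { apiExtractors } from './add-extractor';

// Hypothetical extractor; 'example.com' and the selector are illustrative.
addExtractor({
  domain: 'example.com',
  supportedDomains: ['www.example.com'],
  title: { selectors: ['h1'] },
});

// Assuming mergeSupportedDomains keys the extractor by its domain and each
// supportedDomain, the registry now contains both hostnames:
console.log(Object.keys(apiExtractors)); // ['example.com', 'www.example.com']
```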

@ -0,0 +1,22 @@
import assert from 'assert';
import addExtractor from './add-extractor';
describe('addExtractor(extractor)', () => {
it('can add multiple custom extractors', () => {
addExtractor({ domain: 'www.site1.com' });
addExtractor({ domain: 'www.site2.com' });
const result = addExtractor({ domain: 'www.site3.com' });
assert.equal(Object.keys(result).length, 3);
});
it('returns error if an extractor is not provided', () => {
const result = addExtractor();
assert.equal(result.error, true);
});
it('returns error if a domain key is not included within the custom extractor', () => {
const result = addExtractor({ test: 'abc' });
assert.equal(result.error, true);
});
});

@ -349,3 +349,62 @@ This script will open both an `html` and `json` file allowing you to preview you
If you've written a custom extractor, please send us a pull request! Passing tests that demonstrate your parser in action will help us evaluate the parser.
Sometimes you may find that the site you're parsing doesn't provide certain information. For example, some sites don't have deks, and in those instances, you don't need to write a selector for that field. If there's a test for a selector you don't need, you can just remove that test and make note of it in your pull request.
---
## Adding a Custom Extractor via API
As of **version 2.1.1**, you can also add private custom extractors at runtime via the API. Make sure your custom extractor includes a domain name. Note that extractors added via the API take precedence over the packaged custom extractors.
```javascript
const customExtractor = {
domain: 'www.sandiegouniontribune.com',
title: {
selectors: ['h1', '.ArticlePage-headline'],
},
author: {
selectors: ['.ArticlePage-authorInfo-bio-name'],
},
content: {
selectors: ['article'],
},
};
Mercury.addExtractor(customExtractor);
```
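Once registered, the extractor is picked up automatically by subsequent `Mercury.parse()` calls for matching URLs. A minimal sketch of that flow, reusing the extractor above and assuming `Mercury` has already been imported; the URL is the one used in the test fixture and is illustrative here:
```javascript
Mercury.addExtractor(customExtractor);

// Pages on www.sandiegouniontribune.com are now parsed with the custom selectors.
Mercury.parse(
  'https://www.sandiegouniontribune.com/business/growth-development/story/2019-08-27/sdsu-mission-valley-stadium-management-firm'
).then(result => {
  console.log(result.title, result.author);
});
```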
---
## Passing a Custom Extractor via the CLI
It's also possible to add a custom extractor at runtime via the CLI.
### 1. Create your custom extractor in a standalone file.
```javascript
var customExtractor = {
domain: 'postlight.com',
title: {
selectors: ['h1'],
},
author: {
selectors: ['.byline-name'],
},
content: {
selectors: ['article'],
},
extend: {
uniqueKeyFromFixture: {
selectors: ['.single__hero-category'],
},
},
};
module.exports = customExtractor;
```
### 2. From the CLI, pass the `--add-extractor` argument:
```bash
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js
```
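The file passed to `--add-extractor` is loaded with `require()` (see the CLI change above), so it must export the extractor object via `module.exports`, as in the standalone file from step 1. As with the API, an extractor added at runtime takes precedence over any packaged extractor for the same domain.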

@ -0,0 +1,19 @@
var customExtractor = {
domain: 'postlight.com',
title: {
selectors: ['h1'],
},
author: {
selectors: ['.byline-name'],
},
content: {
selectors: ['article'],
},
extend: {
uniqueKeyFromFixture: {
selectors: ['.single__hero-category'],
},
},
};
module.exports = customExtractor;

@ -3,6 +3,7 @@ import URL from 'url';
import Extractors from './all';
import GenericExtractor from './generic';
import detectByHtml from './detect-by-html';
import { apiExtractors } from './add-extractor';
export default function getExtractor(url, parsedUrl, $) {
parsedUrl = parsedUrl || URL.parse(url);
@ -13,6 +14,8 @@ export default function getExtractor(url, parsedUrl, $) {
.join('.');
return (
apiExtractors[hostname] ||
apiExtractors[baseDomain] ||
Extractors[hostname] ||
Extractors[baseDomain] ||
detectByHtml($) ||
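This lookup order is what gives runtime extractors precedence: the `apiExtractors` registry is consulted first, by full hostname and then by base domain, before the packaged `Extractors` and HTML-based detection are tried.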

@ -4,6 +4,7 @@ import TurndownService from 'turndown';
import Resource from 'resource';
import { validateUrl } from 'utils';
import addCustomExtractor from 'extractors/add-extractor';
import getExtractor from 'extractors/get-extractor';
import RootExtractor, { selectExtendedTypes } from 'extractors/root-extractor';
import collectAllPages from 'extractors/collect-all-pages';
@ -16,6 +17,7 @@ const Mercury = {
contentType = 'html',
headers = {},
extend,
customExtractor,
} = opts;
// if no url was passed and this is the browser version,
@ -43,6 +45,11 @@ const Mercury = {
return $;
}
// Add custom extractor via cli.
if (customExtractor) {
addCustomExtractor(customExtractor);
}
const Extractor = getExtractor(url, parsedUrl, $);
// console.log(`Using extractor for ${Extractor.domain}`);
@ -112,6 +119,10 @@ const Mercury = {
fetchResource(url) {
return Resource.create(url);
},
addExtractor(extractor) {
return addCustomExtractor(extractor);
},
};
export default Mercury;
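Two entry points are wired up here: `Mercury.addExtractor(extractor)` exposes the registry directly for API use, while the `customExtractor` option on `Mercury.parse()` registers the extractor just before extractor lookup, which is how the CLI's `--add-extractor` flag is plumbed through.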

@ -182,4 +182,40 @@ describe('Mercury', () => {
assert.equal(sites.length, 8);
assert.equal(sites[1], 'http://nymag.com/daily/intelligencer/');
});
it('is able to use custom extractors (with extension) added via api', async () => {
const url =
'https://www.sandiegouniontribune.com/business/growth-development/story/2019-08-27/sdsu-mission-valley-stadium-management-firm';
const html = fs.readFileSync(
'./fixtures/sandiegouniontribune.com/test.html',
'utf8'
);
const customExtractor = {
domain: 'www.sandiegouniontribune.com',
title: {
selectors: ['h1', '.ArticlePage-headline'],
},
author: {
selectors: ['.ArticlePage-authorInfo-bio-name'],
},
content: {
selectors: ['article'],
},
extend: {
testContent: {
selectors: ['.ArticlePage-breadcrumbs a'],
},
},
};
Mercury.addExtractor(customExtractor);
const result = await Mercury.parse(url, { html });
assert.equal(typeof result, 'object');
assert.equal(result.author, 'Jennifer Van Grove');
assert.equal(result.domain, 'www.sandiegouniontribune.com');
assert.equal(result.total_pages, 1);
assert.equal(result.testContent, 'Growth & Development');
});
});
