* feat: extract custom types with extend option
Adds an `extend` option that lets you add custom types to be extracted
and returned alongside the defaults, either in a call to `parse()` or in
a custom extractor.
```
Mercury.parse(url, {
  extend: {
    last_edited: { selectors: ['#last-edited'], defaultCleaner: false }
  }
})
```
* chore: use Reflect.ownKeys
* feat: add CLI options
* doc: add extend param to cli help
* refactor: extract selectExtendedTypes
* feat: only overwrite null extended results
* feat: add allowMultiple extraction option
* feat: accept extendList CLI args
* feat: allow attribute selectors in extends on CLI
* test: update extend tests
* fix: don't invoke cleaner for custom types
* feat: always return array if allowMultiple
* test: add test for array of single result
* refactor: extract extractHtml
* refactor: destructure allowMultiple
* fix: wrap multiple matches in $ for cheerio shim
* fix: find extended types before any other munging
* feat: absolutize all links
* fix: clean content more directly
* doc: Update CLI docs in README
* chore: update dist
* doc: Document extend in custom extractor README
@ -67,7 +67,9 @@ If Mercury is unable to find a field, that field will return `null`.
By default, Mercury Parser returns the `content` field as HTML. However, you can override this behavior by passing in options to the `parse` function, specifying whether or not to scrape all pages of an article, and what type of output to return (valid values are `'html'`, `'markdown'`, and `'text'`). For example:
@ -400,10 +400,10 @@ var SPACER_RE = new RegExp('transparent|spacer|blank', 'i'); // The class we wil
// but would normally remove
varKEEP_CLASS='mercury-parser-keep';
varKEEP_SELECTORS=['iframe[src^="https://www.youtube.com"]','iframe[src^="https://www.youtube-nocookie.com"]','iframe[src^="http://www.youtube.com"]','iframe[src^="https://player.vimeo"]','iframe[src^="http://player.vimeo"]'];// A list of tags to strip from the output if we encounter them.
varKEEP_SELECTORS=['iframe[src^="https://www.youtube.com"]','iframe[src^="https://www.youtube-nocookie.com"]','iframe[src^="http://www.youtube.com"]','iframe[src^="https://player.vimeo"]','iframe[src^="http://player.vimeo"]', 'iframe[src^="https://www.redditmedia.com"]'];// A list of tags to strip from the output if we encounter them.
_opts$fetchAllPages=opts.fetchAllPages,fetchAllPages=_opts$fetchAllPages===void0?true:_opts$fetchAllPages,_opts$fallback=opts.fallback,fallback=_opts$fallback===void0?true:_opts$fallback,_opts$contentType=opts.contentType,contentType=_opts$contentType===void0?'html':_opts$contentType; // if no url was passed and this is the browser version,
_opts$fetchAllPages=opts.fetchAllPages,fetchAllPages=_opts$fetchAllPages===void0?true:_opts$fetchAllPages,_opts$fallback=opts.fallback,fallback=_opts$fallback===void0?true:_opts$fallback,_opts$contentType=opts.contentType,contentType=_opts$contentType===void0?'html':_opts$contentType, extend=opts.extend; // if no url was passed and this is the browser version,
// set url to window.location.href and load the html
// from the current page
@ -6455,19 +6666,19 @@ var Mercury = {
case8:
$=_context.sent;
Extractor=getExtractor(url,parsedUrl,$);// console.log(`Using extractor for ${Extractor.domain}`);
// If we found an error creating the resource, return that error
if(!$.failed){
_context.next=12;
_context.next=11;
break;
}
return_context.abrupt("return",$);
case12:
case11:
Extractor=getExtractor(url,parsedUrl,$);// console.log(`Using extractor for ${Extractor.domain}`);
// if html still has not been set (i.e., url passed to Mercury.parse),
// set html from the response of Resource.create
if(!html){
html=$.html();
}// Cached value of every meta name in our document.
@ -6477,6 +6688,16 @@ var Mercury = {
metaCache=$('meta').map(function(_,node){
return$(node).attr('name');
}).toArray();
extendedTypes={};
if(extend){
extendedTypes=selectExtendedTypes(extend,{
$:$,
url:url,
html:html
});
}
result=RootExtractor.extract(Extractor,{
url:url,
html:html,
@ -6489,11 +6710,11 @@ var Mercury = {
_result=result,title=_result.title,next_page_url=_result.next_page_url;// Fetch more pages if next_page_url found
To add a custom key to the response, add an `extend` object to the extractor. The response
will include a result for each key of this object (`categories` in the example below). Setting
`allowMultiple` to `true` tells Mercury to collect every piece of content that matches the
selectors and to always return an array of results for that key, even when only one element matches.
```javascript
export const ExampleExtractorWithExtend = {
  // ...

  extend: {
    categories: {
      selectors: ['.post-taglist a'],
      allowMultiple: true,
    },
  },

  // ...
};
```
### Cleaning content from an article
An article's content can be more complex than the other fields, meaning you sometimes need to do more than just provide the selector(s) in order to return clean content.