2019-02-05 20:14:38 +00:00
|
|
|
#!/usr/bin/env node
|
2019-02-05 22:53:22 +00:00
|
|
|
/* eslint-disable */
|
2019-02-05 20:14:38 +00:00
|
|
|
|
|
|
|
const Mercury = require('./dist/mercury');
|
2019-02-08 00:48:13 +00:00
|
|
|
const argv = require('yargs-parser')(process.argv.slice(2));
|
2019-02-05 20:14:38 +00:00
|
|
|
|
2019-02-08 00:48:13 +00:00
|
|
|
// Unpack the parsed command line: the first positional argument is the URL,
// and each option is read under both of the names the entry point accepts
// (it falls back from the first name to the second with `||`).
const [url] = argv._;
const { format, f } = argv;
const { extend, e } = argv;
const { extendList, l } = argv;
const { header, h } = argv;
|
2019-03-26 11:48:41 +00:00
|
|
|
/**
 * Parse one `type=selector` extension argument.
 *
 * Only the FIRST `=` separates the name from the selector, so selectors that
 * contain `=` themselves (e.g. `a[rel=next]`) are kept intact. A selector with
 * a `|` after its first character is split on `|` into an array; otherwise it
 * is returned as a plain string.
 *
 * @param {*} spec - Raw argument value; expected to be a `type=selector` string.
 * @returns {{name: string, selector: (string|string[])}} The parsed pair.
 * @throws {Error} If the value has no `=`, an empty name, or an empty selector.
 */
function parseExtension(spec) {
  const text = String(spec);
  const separatorIndex = text.indexOf('=');
  if (separatorIndex <= 0 || separatorIndex === text.length - 1) {
    throw new Error(
      `Invalid extension "${text}": expected the form type=selector`
    );
  }

  const name = text.slice(0, separatorIndex);
  const rawSelector = text.slice(separatorIndex + 1);
  const selector =
    rawSelector.indexOf('|') > 0 ? rawSelector.split('|') : rawSelector;

  return { name, selector };
}

/**
 * Add an entry to `extensions` for every spec in `specs`.
 *
 * @param {Object} extensions - Map of extension name to definition; mutated in place.
 * @param {(string|string[]|undefined)} specs - One spec, a list of specs, or nothing.
 * @param {Object} [extraOptions] - Extra properties copied onto each definition.
 * @returns {void}
 * @throws {Error} If any spec is malformed (see `parseExtension`).
 */
function addExtensions(extensions, specs, extraOptions) {
  [].concat(specs || []).forEach(spec => {
    const { name, selector } = parseExtension(spec);
    extensions[name] = Object.assign({ selectors: [selector] }, extraOptions);
  });
}

/**
 * CLI entry point: prints usage when no URL is given, otherwise calls
 * `Mercury.parse` and writes the result to stdout as indented JSON. On any
 * failure it reports the problem on stderr and exits with status 1.
 *
 * @param {string} urlToParse - URL given as the first positional argument.
 * @param {string} [contentType] - Requested format: html, markdown/md, or text/txt.
 * @param {(string|string[])} [extendedTypes] - `type=selector` specs.
 * @param {(string|string[])} [extendedListTypes] - `type=selector` specs, registered with `allowMultiple: true`.
 * @param {Object} [headers] - Passed through unchanged as the `headers` option.
 */
(async (urlToParse, contentType, extendedTypes, extendedListTypes, headers) => {
  if (!urlToParse) {
    const usage = [
      '',
      'mercury-parser',
      '',
      'The Mercury Parser extracts semantic content from any url',
      '',
      'Usage:',
      '',
      '$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... ',
      '',
      '',
    ].join('\n');
    console.log(usage);
    return;
  }

  try {
    // Accepted spellings of --format mapped to the value handed to the parser;
    // an unrecognised format yields `undefined`.
    const contentTypeMap = {
      html: 'html',
      markdown: 'markdown',
      md: 'markdown',
      text: 'text',
      txt: 'text',
    };

    // List extensions are applied second, so they win when a name is repeated.
    const extensions = {};
    addExtensions(extensions, extendedTypes);
    addExtensions(extensions, extendedListTypes, { allowMultiple: true });

    const result = await Mercury.parse(urlToParse, {
      contentType: contentTypeMap[contentType],
      extend: extensions,
      headers,
    });
    console.log(JSON.stringify(result, null, 2));
  } catch (err) {
    // Named `err` so it does not shadow the outer `e` option alias.
    // NOTE(review): this branch was previously disabled with `&& false`;
    // confirm that timeouts surface as code/message 'ETIMEDOUT'.
    const isTimeout =
      Boolean(err) &&
      (err.code === 'ETIMEDOUT' || err.message === 'ETIMEDOUT');

    if (isTimeout) {
      console.error(
        '\nMercury Parser encountered a timeout trying to load that resource.'
      );
    } else {
      console.error(
        '\nMercury Parser encountered a problem trying to parse that resource.\n'
      );
      console.error(err);
    }

    const reportBug =
      'If you believe this was an error, please file an issue at:\n\n https://github.com/postlight/mercury-parser/issues/new';
    console.error(`\n${reportBug}\n`);
    process.exit(1);
  }
})(url, format || f, extend || e, extendList || l, header || h);
|