feat: Support passing custom headers in requests (#337)

pull/340/head
Toufic Mouallem 5 years ago committed by GitHub
parent 3ed778b53e
commit 144a797564
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -80,6 +80,20 @@ This returns the page's `content` as GitHub-flavored Markdown:
"content": "...**Thunder** is the [stage name](https://en.wikipedia.org/wiki/Stage_name) for the..."
```
##### Custom Request Headers
You can include custom headers in requests by passing name-value pairs to the `parse` function as follows:
```javascript
// Custom headers are merged over the parser's default request headers, so a
// `User-Agent` given here replaces the default one for this request.
Mercury.parse(url, {
headers: {
Cookie: 'name=value; name2=value2; name3=value3',
'User-Agent':
'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1',
},
}).then(result => console.log(result));
```
##### Pre-fetched HTML
You can use Mercury Parser to parse custom or pre-fetched HTML by passing an HTML string to the `parse` function as follows:
@ -112,6 +126,9 @@ mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source
# Pass optional --format argument to set content type (html|markdown|text)
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --format=markdown
# Pass optional --header.name=value arguments to include custom headers in the request
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --header.Cookie="name=value; name2=value2; name3=value3" --header.User-Agent="Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1"
# Pass optional --extend argument to add a custom type to the response
mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend credit="p:last-child em"

@ -12,8 +12,10 @@ const {
e,
extendList,
l,
header,
h,
} = argv;
(async (urlToParse, contentType, extendedTypes, extendedListTypes) => {
(async (urlToParse, contentType, extendedTypes, extendedListTypes, headers) => {
if (!urlToParse) {
console.log(
'\n\
@ -21,7 +23,7 @@ mercury-parser\n\n\
The Mercury Parser extracts semantic content from any url\n\n\
Usage:\n\
\n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--extend type=selector]... [--extend-list type=selector]... \n\
$ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... \n\
\n\
'
);
@ -54,6 +56,7 @@ Usage:\n\
const result = await Mercury.parse(urlToParse, {
contentType: contentTypeMap[contentType],
extend: extensions,
headers,
});
console.log(JSON.stringify(result, null, 2));
} catch (e) {
@ -72,4 +75,4 @@ Usage:\n\
console.error(`\n${reportBug}\n`);
process.exit(1);
}
})(url, format || f, extend || e, extendList || l);
})(url, format || f, extend || e, extendList || l, header || h);

167
dist/mercury.js vendored

@ -194,56 +194,6 @@ function getEncoding(str) {
return encoding;
}
// Regenerator bookkeeping: registers `range` as a generator function.
var _marked =
/*#__PURE__*/
_regeneratorRuntime.mark(range);
// Generator over a numeric interval, compiled to a regenerator state machine.
// Both arguments are optional and default to 1 when omitted or undefined.
// Note that each step yields the value of `start` AFTER incrementing it, so the
// produced sequence is start + 1, start + 2, ..., end + 1 (nothing when start > end).
function range() {
var start,
end,
_args = arguments;
return _regeneratorRuntime.wrap(function range$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
// Apply the default of 1 to each missing/undefined argument.
start = _args.length > 0 && _args[0] !== undefined ? _args[0] : 1;
end = _args.length > 1 && _args[1] !== undefined ? _args[1] : 1;
case 2:
// Loop condition: jump to the terminal state once start has passed end.
if (!(start <= end)) {
_context.next = 7;
break;
}
// Yield point: hand back the incremented value and resume at case 5.
_context.next = 5;
return start += 1;
case 5:
// Resumed after the yield; go back and re-check the loop condition.
_context.next = 2;
break;
case 7:
case "end":
return _context.stop();
}
}
}, _marked, this);
}
// Minimal first-pass URL check: a parsed URL is treated as valid only when it
// carries a non-empty hostname. Returns a strict boolean.
function validateUrl(_ref) {
  // A missing or empty hostname means this is not a usable URL.
  return Boolean(_ref.hostname);
}
// Canned error results that are handed back to callers in place of a parse result.
var Errors = {
// Result used when the supplied url is not usable.
// NOTE(review): the text is stored under `messages` (plural) — confirm whether
// consumers expect `message` instead before relying on this key.
badUrl: {
error: true,
messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.'
}
};
// Default headers applied to outgoing resource requests. When `cheerio.browser`
// is truthy no defaults are set; otherwise a desktop Chrome User-Agent string
// is supplied.
var REQUEST_HEADERS = cheerio.browser ? {} : {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
}; // The number of milliseconds to attempt to fetch a resource before timing out.
@ -320,16 +270,22 @@ function _fetchResource() {
_fetchResource = _asyncToGenerator(
/*#__PURE__*/
_regeneratorRuntime.mark(function _callee(url, parsedUrl) {
var options, _ref2, response, body;
var headers,
options,
_ref2,
response,
body,
_args = arguments;
return _regeneratorRuntime.wrap(function _callee$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
headers = _args.length > 2 && _args[2] !== undefined ? _args[2] : {};
parsedUrl = parsedUrl || URL.parse(encodeURI(url));
options = _objectSpread({
url: parsedUrl.href,
headers: _objectSpread({}, REQUEST_HEADERS),
headers: _objectSpread({}, REQUEST_HEADERS, headers),
timeout: FETCH_TIMEOUT,
// Accept cookies
jar: true,
@ -344,31 +300,34 @@ function _fetchResource() {
// Follow GET redirects; this option is for Node only
followRedirect: true
});
_context.next = 4;
_context.next = 5;
return get(options);
case 4:
case 5:
_ref2 = _context.sent;
response = _ref2.response;
body = _ref2.body;
_context.prev = 7;
_context.prev = 8;
validateResponse(response);
return _context.abrupt("return", {
body: body,
response: response
});
case 12:
_context.prev = 12;
_context.t0 = _context["catch"](7);
return _context.abrupt("return", Errors.badUrl);
case 13:
_context.prev = 13;
_context.t0 = _context["catch"](8);
return _context.abrupt("return", {
error: true,
message: _context.t0.message
});
case 15:
case 16:
case "end":
return _context.stop();
}
}
}, _callee, this, [[7, 12]]);
}, _callee, this, [[8, 13]]);
}));
return _fetchResource.apply(this, arguments);
}
@ -1616,17 +1575,23 @@ var Resource = {
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
// :param headers: Custom headers to be included in the request
create: function () {
var _create = _asyncToGenerator(
/*#__PURE__*/
_regeneratorRuntime.mark(function _callee(url, preparedResponse, parsedUrl) {
var result, validResponse;
var headers,
result,
validResponse,
_args = arguments;
return _regeneratorRuntime.wrap(function _callee$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
headers = _args.length > 3 && _args[3] !== undefined ? _args[3] : {};
if (!preparedResponse) {
_context.next = 5;
_context.next = 6;
break;
}
@ -1642,29 +1607,29 @@ var Resource = {
body: preparedResponse,
response: validResponse
};
_context.next = 8;
_context.next = 9;
break;
case 5:
_context.next = 7;
return fetchResource(url, parsedUrl);
case 6:
_context.next = 8;
return fetchResource(url, parsedUrl, headers);
case 7:
case 8:
result = _context.sent;
case 8:
case 9:
if (!result.error) {
_context.next = 11;
_context.next = 12;
break;
}
result.failed = true;
return _context.abrupt("return", result);
case 11:
case 12:
return _context.abrupt("return", this.generateDoc(result));
case 12:
case 13:
case "end":
return _context.stop();
}
@ -1722,6 +1687,49 @@ var Resource = {
}
};
// Regenerator bookkeeping: registers `range` as a generator function.
var _marked =
/*#__PURE__*/
_regeneratorRuntime.mark(range);
// Generator over a numeric interval, compiled to a regenerator state machine.
// Both arguments are optional and default to 1 when omitted or undefined.
// Note that each step yields the value of `start` AFTER incrementing it, so the
// produced sequence is start + 1, start + 2, ..., end + 1 (nothing when start > end).
function range() {
var start,
end,
_args = arguments;
return _regeneratorRuntime.wrap(function range$(_context) {
while (1) {
switch (_context.prev = _context.next) {
case 0:
// Apply the default of 1 to each missing/undefined argument.
start = _args.length > 0 && _args[0] !== undefined ? _args[0] : 1;
end = _args.length > 1 && _args[1] !== undefined ? _args[1] : 1;
case 2:
// Loop condition: jump to the terminal state once start has passed end.
if (!(start <= end)) {
_context.next = 7;
break;
}
// Yield point: hand back the incremented value and resume at case 5.
_context.next = 5;
return start += 1;
case 5:
// Resumed after the yield; go back and re-check the loop condition.
_context.next = 2;
break;
case 7:
case "end":
return _context.stop();
}
}
}, _marked, this);
}
// Minimal first-pass URL check: a parsed URL is treated as valid only when it
// carries a non-empty hostname. Returns a strict boolean.
function validateUrl(_ref) {
  // A missing or empty hostname means this is not a usable URL.
  return Boolean(_ref.hostname);
}
var merge = function merge(extractor, domains) {
return domains.reduce(function (acc, domain) {
acc[domain] = extractor;
@ -3432,7 +3440,7 @@ var WwwNydailynewsComExtractor = {
var WwwCnbcComExtractor = {
domain: 'www.cnbc.com',
title: {
selectors: ['h1.title']
selectors: ['h1.title', 'h1.ArticleHeader-headline']
},
author: {
selectors: [['meta[name="author"]', 'value']]
@ -3444,7 +3452,7 @@ var WwwCnbcComExtractor = {
selectors: [['meta[name="og:image"]', 'value']]
},
content: {
selectors: ['div#article_body.content', 'div.story'],
selectors: ['div#article_body.content', 'div.story', 'div.ArticleBody-articleBody'],
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {},
@ -6623,6 +6631,8 @@ var Mercury = {
fallback,
_opts$contentType,
contentType,
_opts$headers,
headers,
extend,
parsedUrl,
$,
@ -6641,7 +6651,7 @@ var Mercury = {
switch (_context.prev = _context.next) {
case 0:
_ref = _args.length > 1 && _args[1] !== undefined ? _args[1] : {}, html = _ref.html, opts = _objectWithoutProperties(_ref, ["html"]);
_opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, extend = opts.extend; // if no url was passed and this is the browser version,
_opts$fetchAllPages = opts.fetchAllPages, fetchAllPages = _opts$fetchAllPages === void 0 ? true : _opts$fetchAllPages, _opts$fallback = opts.fallback, fallback = _opts$fallback === void 0 ? true : _opts$fallback, _opts$contentType = opts.contentType, contentType = _opts$contentType === void 0 ? 'html' : _opts$contentType, _opts$headers = opts.headers, headers = _opts$headers === void 0 ? {} : _opts$headers, extend = opts.extend; // if no url was passed and this is the browser version,
// set url to window.location.href and load the html
// from the current page
@ -6658,11 +6668,14 @@ var Mercury = {
break;
}
return _context.abrupt("return", Errors.badUrl);
return _context.abrupt("return", {
error: true,
message: 'The url parameter passed does not look like a valid URL. Please check your URL and try again.'
});
case 6:
_context.next = 8;
return Resource.create(url, html, parsedUrl);
return Resource.create(url, html, parsedUrl, headers);
case 8:
$ = _context.sent;

@ -14,6 +14,7 @@ const Mercury = {
fetchAllPages = true,
fallback = true,
contentType = 'html',
headers = {},
extend,
} = opts;
@ -35,7 +36,7 @@ const Mercury = {
};
}
const $ = await Resource.create(url, html, parsedUrl);
const $ = await Resource.create(url, html, parsedUrl, headers);
// If we found an error creating the resource, return that error
if ($.failed) {

@ -12,7 +12,8 @@ const Resource = {
// :param response: If set, use as the response rather than
// attempting to fetch it ourselves. Expects a
// string.
async create(url, preparedResponse, parsedUrl) {
// :param headers: Custom headers to be included in the request
async create(url, preparedResponse, parsedUrl, headers = {}) {
let result;
if (preparedResponse) {
@ -27,7 +28,7 @@ const Resource = {
result = { body: preparedResponse, response: validResponse };
} else {
result = await fetchResource(url, parsedUrl);
result = await fetchResource(url, parsedUrl, headers);
}
if (result.error) {

@ -86,11 +86,11 @@ export function baseDomain({ host }) {
// TODO: Ensure we are not fetching something enormous. Always return
// unicode content for HTML, with charset conversion.
export default async function fetchResource(url, parsedUrl) {
export default async function fetchResource(url, parsedUrl, headers = {}) {
parsedUrl = parsedUrl || URL.parse(encodeURI(url));
const options = {
url: parsedUrl.href,
headers: { ...REQUEST_HEADERS },
headers: { ...REQUEST_HEADERS, ...headers },
timeout: FETCH_TIMEOUT,
// Accept cookies
jar: true,

@ -17,6 +17,22 @@ describe('fetchResource(url)', () => {
assert.equal(error, true);
});
it('passes custom headers in requests', async () => {
  // The echo endpoint replies to a GET with JSON that lists every request
  // header it received, which lets us verify our custom header went out.
  const headerName = 'my-custom-header';
  const headerValue = 'Lorem ipsum dolor sit amet';
  const url = 'https://postman-echo.com/headers';

  const result = await fetchResource(url, URL.parse(url), {
    [headerName]: headerValue,
  });
  const echoed = JSON.parse(result.body.toString());

  assert.equal(echoed.headers[headerName], headerValue);
});
it('returns a buffer as its body', async () => {
const url =
'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0';

Loading…
Cancel
Save