You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/utils/cheerio-query.js

127 lines
3.2 KiB
JavaScript

// This module attempts to square cheerio with jquery
// so that node-specific quirks/features of cheerio
// will also work in the browser. This mostly involves
// shimming a few functions and rewriting the jquery
// constructor so it sandboxes most of its operations
// and doesn't mutate existing dom elements in the page.
import jQuery from 'jquery';
// Class name of the hidden element that $.load appends to <body>. The $
// wrapper below scopes its queries to this element by default.
const PARSER_CLASS = 'mercury-parsing-container';
// Per the jQuery docs, noConflict() relinquishes jQuery's control of the
// global `$`; this module defines and exports its own `$` instead.
jQuery.noConflict();
// Sandboxed replacement for the jQuery constructor.
//
// Unless `contextOverride` is false, lookups are confined to the parsing
// container:
//   - no context           -> the container itself becomes the context
//   - string context       -> the container selector is prepended to it
//   - any other context    -> passed through untouched
// With `contextOverride` set to false the caller's context is used as given.
const $ = (selector, context, rootjQuery, contextOverride = true) => {
  const sandboxSelector = `.${PARSER_CLASS}`;
  let scopedContext = context;

  if (contextOverride) {
    if (!context) {
      scopedContext = sandboxSelector;
    } else if (typeof context === 'string') {
      scopedContext = `${sandboxSelector} ${context}`;
    }
  }

  return new jQuery.fn.init(selector, scopedContext, rootjQuery); // eslint-disable-line new-cap
};
// Alias jQuery's prototype as both $.fn and $.prototype, so anything that
// reads or extends $.fn operates on jQuery.fn.
$.fn = $.prototype = jQuery.fn;
jQuery.extend($, jQuery); // copies trim, extend, etc. onto $
// Drops <script>, <style> and stylesheet <link> elements found beneath $node.
// The same $node is handed back so calls can be chained or nested.
const removeScripts = ($node) => {
  const $unwanted = $node.find('script, style, link[rel="stylesheet"]');
  $unwanted.remove();
  return $node;
};
// Copies the live <html> element (looked up without the sandbox scoping),
// strips scripts and stylesheets from the copy, then wraps each of the copy's
// children in two nested <div>s and returns the result of that wrap chain.
$.cloneHtml = () => {
  const $pageCopy = $('html', null, null, false).clone();
  const $sanitized = removeScripts($pageCopy);
  const $children = $sanitized.children();
  return $children.wrap('<div />').wrap('<div />');
};
// Returns the first element matched by '*' through the sandboxed $.
$.root = () => {
  const $everything = $('*');
  return $everything.first();
};
// Flag set to true on this browser build of $.
$.browser = true;
// True when the first element in $node is a <container> tag (compared
// case-insensitively); false for an empty set or a node without a tag name.
const isContainer = ($node) => {
  const element = $node.get(0);
  const tagName = element ? element.tagName : undefined;
  return tagName ? tagName.toLowerCase() === 'container' : false;
};
// Serializes markup.
//
// With a $node:
//   - if the node, or its <container> child, is a parsing container, return
//     the inner HTML of that child (falling back to the node's own inner
//     HTML), so the container element itself is never part of the output;
//   - otherwise return the outer HTML of the node's first element.
// Without a $node:
//   - if the script-free copy of <body> holds the parsing container, return
//     the inner HTML of the container's first child;
//   - otherwise return the copied <head> and <body> contents, each wrapped
//     in a <container>, inside one outer <container>.
$.html = ($node) => {
  if (!$node) {
    const $body = removeScripts($('body', null, null, false).clone());
    const $head = removeScripts($('head', null, null, false).clone());

    const $sandbox = $body.find(`.${PARSER_CLASS}`);
    if ($sandbox.length > 0) {
      return $sandbox.children().html();
    }

    const $headPart = $(`<container>${$head.html()}</container>`);
    const $bodyPart = $(`<container>${$body.html()}</container>`);
    const $combined = $('<container />').append($headPart).append($bodyPart);
    // Wrap and step up to the wrapper so .html() includes $combined itself.
    return $combined.wrap('<container />').parent().html();
  }

  // we never want to return a parsing container, only its children
  const $inner = $node.children('container');
  if (isContainer($node) || isContainer($inner)) {
    return $inner.html() || $node.html();
  }

  // Appending a clone to a throwaway <div> yields the node's outer HTML.
  const $holder = $('<div>');
  $holder.append($node.eq(0).clone());
  return $holder.html();
};
// Removes every parsing container from the page. The lookup bypasses the
// sandbox scoping so the containers themselves can be matched.
$.cleanup = () => {
  const $sandboxes = $(`.${PARSER_CLASS}`, null, null, false);
  $sandboxes.remove();
};
// Loads markup into the hidden parsing container and returns the sandboxed $.
//
// html       - markup to load; when falsy, a script-free copy of the current
//              page (from $.cloneHtml) is used instead.
// opts       - options object; `normalizeWhitespace` collapses every run of
//              whitespace in a string `html` into a single space.
// returnHtml - when truthy, returns `{ $, html }`, where `html` is the result
//              of calling .html() on the loaded content; otherwise returns $.
$.load = (html, opts = {}, returnHtml = false) => {
  const { normalizeWhitespace } = opts;

  let $content;
  if (!html) {
    $content = $.cloneHtml();
  } else {
    const markup =
      normalizeWhitespace && typeof html === 'string'
        ? html.replace(/\s+/g, ' ')
        : html;
    $content = $('<container />').html(markup);
  }

  // Reuse the parsing container if the page already has one; otherwise
  // append a hidden one to <body>.
  const $body = $('body', null, null, false);
  let $parsingNode = $body.find(`.${PARSER_CLASS}`);
  if (!$parsingNode[0]) {
    $body.append(`<div class="${PARSER_CLASS}" style="display: none;" />`);
    $parsingNode = $body.find(`.${PARSER_CLASS}`);
  }

  // Strip scripts
  $content = removeScripts($content);

  // Remove comments. find('*') only yields descendants, so addBack() puts the
  // root element(s) back into the set; without it, comment nodes that are
  // direct children of the root would never be visited by contents().
  $content
    .find('*')
    .addBack()
    .contents()
    .each(function () {
      if (this.nodeType === Node.COMMENT_NODE) { // eslint-disable-line no-undef
        $(this).remove();
      }
    });

  $parsingNode.html($content);

  if (returnHtml) return { $, html: $content.html() };
  return $;
};
export default $;