release: 2.2.3 (#703)

add-extension-instructions v2.2.3
John Holdun 2 years ago committed by GitHub
parent 635fcf6356
commit ad8d4aa268
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,5 +1,12 @@
# Mercury Parser Changelog
### 2.2.3 (Oct 24, 2022)
- [[`635fcf6356`](https://github.com/postlight/parser/commit/635fcf6356)] - **fix**: handle sec & ms timestamps properly (#702) (Austin)
- [[`ab401822aa`](https://github.com/postlight/parser/commit/ab401822aa)] - maintenance update - october 2022 (#696) (Michael Ashley)
- [[`8ca8a5f7e5`](https://github.com/postlight/parser/commit/8ca8a5f7e5)] - **feat**: add postlight.com custom extractor (#695) (Sarah Doire)
- [[`39b9ff55c4`](https://github.com/postlight/parser/commit/39b9ff55c4)] - **release**: 2.2.2 (#689) (John Holdun)
### 2.2.2 (Sept 08, 2022)
##### Commits

64
dist/mercury.js vendored

@ -1540,6 +1540,19 @@ var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
// the src attribute so the images are no longer lazy loaded.
function convertLazyLoadedImages($) {
// Treats `str` as JSON and pulls out its `src` property.
// Returns the `src` value when it is a string; returns false when `str` is
// not valid JSON, when the parsed value has no string `src`, or when reading
// `src` throws (e.g. the JSON text "null").
var extractSrcFromJSON = function extractSrcFromJSON(str) {
  var candidate;
  try {
    // The property read stays inside the try so a parsed `null` is handled
    // the same way as a parse failure.
    candidate = JSON.parse(str).src;
  } catch (err) {
    return false;
  }
  return typeof candidate === 'string' ? candidate : false;
};
$('img').each(function (_, img) {
var attrs = getAttrs(img);
@ -1549,7 +1562,14 @@ function convertLazyLoadedImages($) {
if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {
$(img).attr('srcset', value);
} else if (attr !== 'src' && attr !== 'srcset' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
$(img).attr('src', value);
// Is the value a JSON object? If so, we should attempt to extract the image src from the data.
var existingSrc = extractSrcFromJSON(value);
if (existingSrc) {
$(img).attr('src', existingSrc);
} else {
$(img).attr('src', value);
}
}
});
});
@ -2388,6 +2408,14 @@ var MediumExtractor = {
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
// Allow drop cap character.
'section span:first-of-type': function sectionSpanFirstOfType($node) {
var $text = $node.html();
if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
$node.replaceWith($text);
}
},
// Re-write lazy-loaded youtube videos
iframe: function iframe($node) {
var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
@ -2429,7 +2457,7 @@ var MediumExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['span', 'svg']
clean: ['span a', 'svg']
},
date_published: {
selectors: [['meta[name="article:published_time"]', 'value']]
@ -6411,10 +6439,14 @@ function cleanDatePublished(dateString) {
format = _ref.format;
// If string is in milliseconds or seconds, convert to int and return
if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
if (MS_DATE_STRING.test(dateString)) {
return new Date(_parseInt(dateString, 10)).toISOString();
}
if (SEC_DATE_STRING.test(dateString)) {
return new Date(_parseInt(dateString, 10) * 1000).toISOString();
}
var date = createDate(dateString, timezone, format);
if (!date.isValid()) {
@ -7546,13 +7578,26 @@ var GenericExcerptExtractor = {
}
};
// Counts the whitespace-separated pieces in the text of the first <div>
// found in `content`. The text is passed through normalizeSpaces before it
// is split on single whitespace characters.
var getWordCount = function getWordCount(content) {
  var $ = cheerio.load(content);
  var firstDivText = $('div').first().text();
  return normalizeSpaces(firstDivText).split(/\s/).length;
};
// Alternate word count that works on the raw markup string: every `<...>`
// tag becomes a space, whitespace runs collapse to one space, the ends are
// trimmed, and the number of space-separated pieces is returned.
// Note: a string that ends up empty still yields 1, since ''.split(' ')
// produces a one-element array.
var getWordCountAlt = function getWordCountAlt(content) {
  var flattened = content
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  return flattened.split(' ').length;
};
var GenericWordCountExtractor = {
extract: function extract(_ref) {
var content = _ref.content;
var $ = cheerio.load(content);
var $content = $('div').first();
var text = normalizeSpaces($content.text());
return text.split(/\s/).length;
var count = getWordCount(content);
if (count === 1) count = getWordCountAlt(content);
return count;
}
};
@ -7715,7 +7760,8 @@ function select(opts) {
_extractionOpts$defau = extractionOpts.defaultCleaner,
defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau,
allowMultiple = extractionOpts.allowMultiple;
var matchingSelector = findMatchingSelector($, selectors, extractHtml, allowMultiple);
var overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
var matchingSelector = findMatchingSelector($, selectors, extractHtml, overrideAllowMultiple);
if (!matchingSelector) return null;
function transformAndClean($node) {
@ -7988,7 +8034,7 @@ function _collectAllPages() {
});
return _context.abrupt("return", _objectSpread({}, result, {
total_pages: pages,
pages_rendered: pages,
rendered_pages: pages,
word_count: word_count
}));

File diff suppressed because one or more lines are too long

@ -1,6 +1,6 @@
{
"name": "@postlight/parser",
"version": "2.2.2",
"version": "2.2.3",
"description": "Postlight Parser transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
"author": "Postlight <mercury@postlight.com>",
"homepage": "https://reader.postlight.com",

Loading…
Cancel
Save