release: 2.2.3 (#703)

2 years ago · ad8d4aa268
parent 635fcf6356
commit ad8d4aa268
4 changed files with 64 additions and 11 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Mercury Parser Changelog

+### 2.2.3 (Oct 24, 2022)
+
+- [[`635fcf6356`](https://github.com/postlight/parser/commit/635fcf6356)] - **fix**: handle sec & ms timestamps properly (#702) (Austin)
+- [[`ab401822aa`](https://github.com/postlight/parser/commit/ab401822aa)] - maintenance update - october 2022 (#696) (Michael Ashley)
+- [[`8ca8a5f7e5`](https://github.com/postlight/parser/commit/8ca8a5f7e5)] - **feat**: add postlight.com custom extractor (#695) (Sarah Doire)
+- [[`39b9ff55c4`](https://github.com/postlight/parser/commit/39b9ff55c4)] - **release**: 2.2.2 (#689) (John Holdun)
+
 ### 2.2.2 (Sept 08, 2022)

 ##### Commits
--- a/dist/mercury.js
+++ b/dist/mercury.js
@@ -1540,6 +1540,19 @@ var TAGS_TO_REMOVE = ['script', 'style', 'form'].join(',');
 // the src attribute so the images are no longer lazy loaded.

 function convertLazyLoadedImages($) {
+  var extractSrcFromJSON = function extractSrcFromJSON(str) {
+    try {
+      var _JSON$parse = JSON.parse(str),
+          src = _JSON$parse.src;
+
+      if (typeof src === 'string') return src;
+    } catch (e) {
+      return false;
+    }
+
+    return false;
+  };
+
  $('img').each(function (_, img) {
    var attrs = getAttrs(img);

@@ -1549,7 +1562,14 @@ function convertLazyLoadedImages($) {
      if (attr !== 'srcset' && IS_LINK.test(value) && IS_SRCSET.test(value)) {
        $(img).attr('srcset', value);
      } else if (attr !== 'src' && attr !== 'srcset' && IS_LINK.test(value) && IS_IMAGE.test(value)) {
-        $(img).attr('src', value);
+        // Is the value a JSON object? If so, we should attempt to extract the image src from the data.
+        var existingSrc = extractSrcFromJSON(value);
+
+        if (existingSrc) {
+          $(img).attr('src', existingSrc);
+        } else {
+          $(img).attr('src', value);
+        }
      }
    });
  });
@@ -2388,6 +2408,14 @@ var MediumExtractor = {
    // Is there anything in the content you selected that needs to be transformed
    // before it's consumable content? E.g., unusual lazy-loaded images
    transforms: {
+      // Allow drop cap character.
+      'section span:first-of-type': function sectionSpanFirstOfType($node) {
+        var $text = $node.html();
+
+        if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
+          $node.replaceWith($text);
+        }
+      },
      // Re-write lazy-loaded youtube videos
      iframe: function iframe($node) {
        var ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
@@ -2429,7 +2457,7 @@ var MediumExtractor = {
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
-    clean: ['span', 'svg']
+    clean: ['span a', 'svg']
  },
  date_published: {
    selectors: [['meta[name="article:published_time"]', 'value']]
@@ -6411,10 +6439,14 @@ function cleanDatePublished(dateString) {
      format = _ref.format;

  // If string is a timestamp in milliseconds or seconds, convert it to a Date and return its ISO string
-  if (MS_DATE_STRING.test(dateString) || SEC_DATE_STRING.test(dateString)) {
+  if (MS_DATE_STRING.test(dateString)) {
    return new Date(_parseInt(dateString, 10)).toISOString();
  }

+  if (SEC_DATE_STRING.test(dateString)) {
+    return new Date(_parseInt(dateString, 10) * 1000).toISOString();
+  }
+
  var date = createDate(dateString, timezone, format);

  if (!date.isValid()) {
@@ -7546,13 +7578,26 @@ var GenericExcerptExtractor = {
  }
 };

+var getWordCount = function getWordCount(content) {
+  var $ = cheerio.load(content);
+  var $content = $('div').first();
+  var text = normalizeSpaces($content.text());
+  return text.split(/\s/).length;
+};
+
+var getWordCountAlt = function getWordCountAlt(content) {
+  content = content.replace(/<[^>]*>/g, ' ');
+  content = content.replace(/\s+/g, ' ');
+  content = content.trim();
+  return content.split(' ').length;
+};
+
 var GenericWordCountExtractor = {
  extract: function extract(_ref) {
    var content = _ref.content;
-    var $ = cheerio.load(content);
-    var $content = $('div').first();
-    var text = normalizeSpaces($content.text());
-    return text.split(/\s/).length;
+    var count = getWordCount(content);
+    if (count === 1) count = getWordCountAlt(content);
+    return count;
  }
 };

@@ -7715,7 +7760,8 @@ function select(opts) {
      _extractionOpts$defau = extractionOpts.defaultCleaner,
      defaultCleaner = _extractionOpts$defau === void 0 ? true : _extractionOpts$defau,
      allowMultiple = extractionOpts.allowMultiple;
-  var matchingSelector = findMatchingSelector($, selectors, extractHtml, allowMultiple);
+  var overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
+  var matchingSelector = findMatchingSelector($, selectors, extractHtml, overrideAllowMultiple);
  if (!matchingSelector) return null;

  function transformAndClean($node) {
@@ -7988,7 +8034,7 @@ function _collectAllPages() {
            });
            return _context.abrupt("return", _objectSpread({}, result, {
              total_pages: pages,
-              pages_rendered: pages,
+              rendered_pages: pages,
              word_count: word_count
            }));

--- a/dist/mercury.web.js
+++ b/dist/mercury.web.js
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
  "name": "@postlight/parser",
-  "version": "2.2.2",
+  "version": "2.2.3",
  "description": "Postlight Parser transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.",
  "author": "Postlight <mercury@postlight.com>",
  "homepage": "https://reader.postlight.com",