maintenance update - october 2022 (#696)

* fix: add alternative word count method
* fix: replace pages_rendered key with rendered_pages for consistency
* fix: return first lead_image_url when multiple og:image present
* fix: properly pull image src from lazy loaded img
* fix: allow drop cap character in medium custom extractor
* fix: refined medium parser
2 years ago · ab401822aa
parent 8ca8a5f7e5
commit ab401822aa
7 changed files with 52 additions and 14 deletions
--- a/src/extractors/collect-all-pages.js
+++ b/src/extractors/collect-all-pages.js
@ -54,7 +54,7 @@ export default async function collectAllPages({
  return {
    ...result,
    total_pages: pages,
-    pages_rendered: pages,
+    rendered_pages: pages,
    word_count,
  };
 }
--- a/src/extractors/custom/medium.com/index.js
+++ b/src/extractors/custom/medium.com/index.js
@ -15,6 +15,13 @@ export const MediumExtractor = {
    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
    transforms: {
      // Allow drop cap character.
      'section span:first-of-type': $node => {
        const $text = $node.html();
        if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
          $node.replaceWith($text);
        }
      },
      // Re-write lazy-loaded youtube videos
      iframe: $node => {
        const ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
@ -55,7 +62,7 @@ export const MediumExtractor = {
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
-    clean: ['span', 'svg'],
+    clean: ['span a', 'svg'],
  },
  date_published: {
--- a/src/extractors/custom/techlog.iij.ad.jp/index.test.js
+++ b/src/extractors/custom/techlog.iij.ad.jp/index.test.js
@ -84,14 +84,14 @@ describe('TechlogIijAdJpExtractor', () => {
      );
    });
-    it('returns the pages_rendered', async () => {
+    it('returns the rendered_pages', async () => {
      // To pass this test, fill out the rendered_pages selector
      // in ./src/extractors/custom/techlog.iij.ad.jp/index.js.
-      const { pages_rendered } = await result;
+      const { rendered_pages } = await result;
      // Update these values with the expected values from
      // the article.
-      assert.equal(pages_rendered, null);
+      assert.equal(rendered_pages, 1);
    });
    it('returns the content', async () => {
--- a/src/extractors/generic/word-count/extractor.js
+++ b/src/extractors/generic/word-count/extractor.js
@ -2,13 +2,25 @@ import cheerio from 'cheerio';
 import { normalizeSpaces } from 'utils/text';
 // Primary word counter: tokenises the text of the first <div> found in
 // `content` and returns how many whitespace-separated pieces it has.
 const getWordCount = content => {
  const $ = cheerio.load(content);
  const firstDivText = $('div')
    .first()
    .text();
  const tokens = normalizeSpaces(firstDivText).split(/\s/);
  return tokens.length;
 };
 /**
  * Fallback word counter that works on the raw markup string instead of a
  * parsed DOM: every tag-like `<...>` run is replaced by a space, whitespace
  * is collapsed, and the remaining space-separated tokens are counted.
  *
  * @param {string} content - HTML (or plain text) to count words in.
  * @returns {number} Number of tokens; 0 when no text remains.
  */
 const getWordCountAlt = content => {
  const text = content
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  // ''.split(' ') yields [''], which would report one word for empty or
  // markup-only content, so handle the no-text case explicitly.
  return text === '' ? 0 : text.split(' ').length;
 };
 const GenericWordCountExtractor = {
  extract({ content }) {
-    const $ = cheerio.load(content);
-    const $content = $('div').first();
-
-    const text = normalizeSpaces($content.text());
-    return text.split(/\s/).length;
+    // Use the DOM-based count first; a result of 1 means no usable text was
+    // found in the first <div>, so fall back to the markup-stripping count.
+    let count = getWordCount(content);
+    if (count === 1) count = getWordCountAlt(content);
+    return count;
  },
 };
--- a/src/extractors/root-extractor.js
+++ b/src/extractors/root-extractor.js
@ -76,11 +76,13 @@ export function select(opts) {
  const { selectors, defaultCleaner = true, allowMultiple } = extractionOpts;
  const overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
  const matchingSelector = findMatchingSelector(
    $,
    selectors,
    extractHtml,
-    allowMultiple
+    overrideAllowMultiple
  );
  if (!matchingSelector) return null;
--- a/src/mercury.test.js
+++ b/src/mercury.test.js
@ -78,10 +78,10 @@ describe('Parser', () => {
        'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
      const result = await Parser.parse(url, { fetchAllPages: true });
-      const { total_pages, pages_rendered } = result;
+      const { total_pages, rendered_pages } = result;
      assert.equal(total_pages, 3);
-      assert.equal(pages_rendered, 3);
+      assert.equal(rendered_pages, 3);
      assert.equal(result.next_page_url, `${url}2`);
    });
--- a/src/resource/utils/dom/convert-lazy-loaded-images.js
+++ b/src/resource/utils/dom/convert-lazy-loaded-images.js
@ -8,6 +8,17 @@ import { IS_LINK, IS_IMAGE, IS_SRCSET } from './constants';
 // attribute that is a placeholder. We need to be able to properly fill in
 // the src attribute so the images are no longer lazy loaded.
 export default function convertLazyLoadedImages($) {
  // Treat `str` as JSON and pull out its `src` field. Returns the field when
  // it is a string; returns false when parsing/destructuring fails or when
  // `src` is missing or not a string.
  const extractSrcFromJSON = str => {
    let candidate;
    try {
      // Destructuring stays inside the try: JSON.parse('null') succeeds but
      // reading a property from null throws.
      ({ src: candidate } = JSON.parse(str));
    } catch (e) {
      return false;
    }
    return typeof candidate === 'string' ? candidate : false;
  };
  $('img').each((_, img) => {
    const attrs = getAttrs(img);
@ -22,7 +33,13 @@ export default function convertLazyLoadedImages($) {
        IS_LINK.test(value) &&
        IS_IMAGE.test(value)
      ) {
-        $(img).attr('src', value);
+        // Is the value a JSON object? If so, we should attempt to extract the image src from the data.
        const existingSrc = extractSrcFromJSON(value);
        if (existingSrc) {
          $(img).attr('src', existingSrc);
        } else {
          $(img).attr('src', value);
        }
      }
    });
  });