From ab401822aa02bbefe5189618a7acab427342c51c Mon Sep 17 00:00:00 2001 From: Michael Ashley Date: Fri, 7 Oct 2022 10:47:41 -0500 Subject: [PATCH] maintenance update - october 2022 (#696) * fix: add alternative word count method * fix: replace pages_rendered key with rendered_pages for consistency * fix: return first lead_image_url when multiple og:image present * fix: properly pull image src from lazy loaded img * fix: allow drop cap character in medium custom extractor * fix: refined medium parser --- src/extractors/collect-all-pages.js | 2 +- src/extractors/custom/medium.com/index.js | 9 +++++++- .../custom/techlog.iij.ad.jp/index.test.js | 6 ++--- .../generic/word-count/extractor.js | 22 ++++++++++++++----- src/extractors/root-extractor.js | 4 +++- src/mercury.test.js | 4 ++-- .../utils/dom/convert-lazy-loaded-images.js | 19 +++++++++++++++- 7 files changed, 52 insertions(+), 14 deletions(-) diff --git a/src/extractors/collect-all-pages.js b/src/extractors/collect-all-pages.js index d33c423b..aae2fb06 100644 --- a/src/extractors/collect-all-pages.js +++ b/src/extractors/collect-all-pages.js @@ -54,7 +54,7 @@ export default async function collectAllPages({ return { ...result, total_pages: pages, - pages_rendered: pages, + rendered_pages: pages, word_count, }; } diff --git a/src/extractors/custom/medium.com/index.js b/src/extractors/custom/medium.com/index.js index 01fc9400..09384ba3 100644 --- a/src/extractors/custom/medium.com/index.js +++ b/src/extractors/custom/medium.com/index.js @@ -15,6 +15,13 @@ export const MediumExtractor = { // Is there anything in the content you selected that needs transformed // before it's consumable content? E.g., unusual lazy loaded images transforms: { + // Allow drop cap character. + 'section span:first-of-type': $node => { + const $text = $node.html(); + if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) { + $node.replaceWith($text); + } + }, // Re-write lazy-loaded youtube videos iframe: $node => { const ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//; @@ -55,7 +62,7 @@ export const MediumExtractor = { // Is there anything that is in the result that shouldn't be? // The clean selectors will remove anything that matches from // the result - clean: ['span', 'svg'], + clean: ['span a', 'svg'], }, date_published: { diff --git a/src/extractors/custom/techlog.iij.ad.jp/index.test.js b/src/extractors/custom/techlog.iij.ad.jp/index.test.js index 3a914fec..16fa9db8 100644 --- a/src/extractors/custom/techlog.iij.ad.jp/index.test.js +++ b/src/extractors/custom/techlog.iij.ad.jp/index.test.js @@ -84,14 +84,14 @@ describe('TechlogIijAdJpExtractor', () => { ); }); - it('returns the pages_rendered', async () => { + it('returns the rendered_pages', async () => { // To pass this test, fill out the pages_rendered selector // in ./src/extractors/custom/techlog.iij.ad.jp/index.js. - const { pages_rendered } = await result; + const { rendered_pages } = await result; // Update these values with the expected values from // the article. - assert.equal(pages_rendered, null); + assert.equal(rendered_pages, 1); }); it('returns the content', async () => { diff --git a/src/extractors/generic/word-count/extractor.js b/src/extractors/generic/word-count/extractor.js index d0e86c8a..792eaf8a 100644 --- a/src/extractors/generic/word-count/extractor.js +++ b/src/extractors/generic/word-count/extractor.js @@ -2,13 +2,25 @@ import cheerio from 'cheerio'; import { normalizeSpaces } from 'utils/text'; +const getWordCount = content => { + const $ = cheerio.load(content); + const $content = $('div').first(); + const text = normalizeSpaces($content.text()); + return text.split(/\s/).length; +}; + +const getWordCountAlt = content => { + content = content.replace(/<[^>]*>/g, ' '); + content = content.replace(/\s+/g, ' '); + content = content.trim(); + return content.split(' ').length; +}; + const GenericWordCountExtractor = { extract({ content }) { - const $ = cheerio.load(content); - const $content = $('div').first(); - - const text = normalizeSpaces($content.text()); - return text.split(/\s/).length; + let count = getWordCount(content); + if (count === 1) count = getWordCountAlt(content); + return count; }, }; diff --git a/src/extractors/root-extractor.js b/src/extractors/root-extractor.js index 8efe7fcf..27ba24b1 100644 --- a/src/extractors/root-extractor.js +++ b/src/extractors/root-extractor.js @@ -76,11 +76,13 @@ export function select(opts) { const { selectors, defaultCleaner = true, allowMultiple } = extractionOpts; + const overrideAllowMultiple = type === 'lead_image_url' || allowMultiple; + const matchingSelector = findMatchingSelector( $, selectors, extractHtml, - allowMultiple + overrideAllowMultiple ); if (!matchingSelector) return null; diff --git a/src/mercury.test.js b/src/mercury.test.js index 3d83130c..d51e0342 100644 --- a/src/mercury.test.js +++ b/src/mercury.test.js @@ -78,10 +78,10 @@ describe('Parser', () => { 'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'; const result = await Parser.parse(url, { fetchAllPages: true }); - const { total_pages, pages_rendered } = result; + const { total_pages, rendered_pages } = result; assert.equal(total_pages, 3); - assert.equal(pages_rendered, 3); + assert.equal(rendered_pages, 3); assert.equal(result.next_page_url, `${url}2`); }); diff --git a/src/resource/utils/dom/convert-lazy-loaded-images.js b/src/resource/utils/dom/convert-lazy-loaded-images.js index 03eb6340..aa8aa5d1 100644 --- a/src/resource/utils/dom/convert-lazy-loaded-images.js +++ b/src/resource/utils/dom/convert-lazy-loaded-images.js @@ -8,6 +8,17 @@ import { IS_LINK, IS_IMAGE, IS_SRCSET } from './constants'; // attribute that a is a placeholer. We need to be able to properly fill in // the src attribute so the images are no longer lazy loaded. export default function convertLazyLoadedImages($) { + const extractSrcFromJSON = str => { + try { + const { src } = JSON.parse(str); + if (typeof src === 'string') return src; + } catch (e) { + return false; + } + + return false; + }; + $('img').each((_, img) => { const attrs = getAttrs(img); @@ -22,7 +33,13 @@ export default function convertLazyLoadedImages($) { IS_LINK.test(value) && IS_IMAGE.test(value) ) { - $(img).attr('src', value); + // Is the value a JSON object? If so, we should attempt to extract the image src from the data. + const existingSrc = extractSrcFromJSON(value); + if (existingSrc) { + $(img).attr('src', existingSrc); + } else { + $(img).attr('src', value); + } } }); });