maintenance update - october 2022 (#696)

* fix: add alternative word count method

* fix: replace pages_rendered key with rendered_pages for consistency

* fix: return first lead_image_url when multiple og:image present

* fix: properly pull image src from lazy loaded img

* fix: allow drop cap character in medium custom extractor

* fix: refined medium parser
pull/702/head^2
Michael Ashley 2 years ago committed by GitHub
parent 8ca8a5f7e5
commit ab401822aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -54,7 +54,7 @@ export default async function collectAllPages({
return { return {
...result, ...result,
total_pages: pages, total_pages: pages,
pages_rendered: pages, rendered_pages: pages,
word_count, word_count,
}; };
} }

@ -15,6 +15,13 @@ export const MediumExtractor = {
// Is there anything in the content you selected that needs to be transformed // Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images // before it's consumable content? E.g., unusual lazy loaded images
transforms: { transforms: {
// Allow drop cap character.
// For each span matched by this selector, read its inner HTML; when that is
// exactly one character and the character is an ASCII letter or a
// parenthesis, replace the span element with that bare character.
// NOTE(review): presumably this unwraps a decorative drop-cap span so the
// letter stays part of the article text — confirm against Medium markup.
'section span:first-of-type': $node => {
// Despite the `$` prefix this holds the span's inner HTML, not a node.
const $text = $node.html();
// The length check limits the match to a single character; markup inside the
// span would make the HTML longer than 1 and skip the replacement.
if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
$node.replaceWith($text);
}
},
// Re-write lazy-loaded youtube videos // Re-write lazy-loaded youtube videos
iframe: $node => { iframe: $node => {
const ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//; const ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
@ -55,7 +62,7 @@ export const MediumExtractor = {
// Is there anything that is in the result that shouldn't be? // Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from // The clean selectors will remove anything that matches from
// the result // the result
clean: ['span', 'svg'], clean: ['span a', 'svg'],
}, },
date_published: { date_published: {

@ -84,14 +84,14 @@ describe('TechlogIijAdJpExtractor', () => {
); );
}); });
it('returns the pages_rendered', async () => { it('returns the rendered_pages', async () => {
// To pass this test, fill out the pages_rendered selector // To pass this test, fill out the pages_rendered selector
// in ./src/extractors/custom/techlog.iij.ad.jp/index.js. // in ./src/extractors/custom/techlog.iij.ad.jp/index.js.
const { pages_rendered } = await result; const { rendered_pages } = await result;
// Update these values with the expected values from // Update these values with the expected values from
// the article. // the article.
assert.equal(pages_rendered, null); assert.equal(rendered_pages, 1);
}); });
it('returns the content', async () => { it('returns the content', async () => {

@ -2,13 +2,25 @@ import cheerio from 'cheerio';
import { normalizeSpaces } from 'utils/text'; import { normalizeSpaces } from 'utils/text';
// Primary word counter: parses `content` with cheerio, takes the text of the
// first <div>, passes it through normalizeSpaces, and counts the pieces
// produced by splitting on single whitespace characters.
// Because splitting a string that contains no whitespace (including an empty
// one) yields a single piece, the smallest possible result is 1.
const getWordCount = content => {
  const firstDivText = cheerio
    .load(content)('div')
    .first()
    .text();
  const pieces = normalizeSpaces(firstDivText).split(/\s/);
  return pieces.length;
};
// Fallback word counter that works on the raw markup string instead of a
// parsed document: every tag is replaced by a space, runs of whitespace are
// collapsed to a single space, the ends are trimmed, and the remaining
// space-separated tokens are counted.
//
// @param {string} content - markup (or plain text) to count words in.
// @returns {number} number of words; 0 when nothing but markup or whitespace
//   is left after stripping.
const getWordCountAlt = content => {
  // Build a new string rather than reassigning the parameter.
  const text = content
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  // ''.split(' ') is [''], which would otherwise be reported as one word.
  return text === '' ? 0 : text.split(' ').length;
};
const GenericWordCountExtractor = { const GenericWordCountExtractor = {
extract({ content }) { extract({ content }) {
const $ = cheerio.load(content); let count = getWordCount(content);
const $content = $('div').first(); if (count === 1) count = getWordCountAlt(content);
return count;
const text = normalizeSpaces($content.text());
return text.split(/\s/).length;
}, },
}; };

@ -76,11 +76,13 @@ export function select(opts) {
const { selectors, defaultCleaner = true, allowMultiple } = extractionOpts; const { selectors, defaultCleaner = true, allowMultiple } = extractionOpts;
const overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
const matchingSelector = findMatchingSelector( const matchingSelector = findMatchingSelector(
$, $,
selectors, selectors,
extractHtml, extractHtml,
allowMultiple overrideAllowMultiple
); );
if (!matchingSelector) return null; if (!matchingSelector) return null;

@ -78,10 +78,10 @@ describe('Parser', () => {
'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'; 'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const result = await Parser.parse(url, { fetchAllPages: true }); const result = await Parser.parse(url, { fetchAllPages: true });
const { total_pages, pages_rendered } = result; const { total_pages, rendered_pages } = result;
assert.equal(total_pages, 3); assert.equal(total_pages, 3);
assert.equal(pages_rendered, 3); assert.equal(rendered_pages, 3);
assert.equal(result.next_page_url, `${url}2`); assert.equal(result.next_page_url, `${url}2`);
}); });

@ -8,6 +8,17 @@ import { IS_LINK, IS_IMAGE, IS_SRCSET } from './constants';
// attribute that is a placeholder. We need to be able to properly fill in // attribute that is a placeholder. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded. // the src attribute so the images are no longer lazy loaded.
export default function convertLazyLoadedImages($) { export default function convertLazyLoadedImages($) {
// Attempt to read a string `src` property from a JSON-encoded value.
// Returns that string when the value parses and `src` is a string; returns
// false when parsing fails or `src` is missing or not a string.
const extractSrcFromJSON = str => {
  let candidate;
  try {
    // The property lookup stays inside the try: a parsed `null` makes the
    // destructuring throw, and that must also end in `false`.
    ({ src: candidate } = JSON.parse(str));
  } catch (e) {
    return false;
  }
  return typeof candidate === 'string' ? candidate : false;
};
$('img').each((_, img) => { $('img').each((_, img) => {
const attrs = getAttrs(img); const attrs = getAttrs(img);
@ -22,7 +33,13 @@ export default function convertLazyLoadedImages($) {
IS_LINK.test(value) && IS_LINK.test(value) &&
IS_IMAGE.test(value) IS_IMAGE.test(value)
) { ) {
$(img).attr('src', value); // Is the value a JSON object? If so, we should attempt to extract the image src from the data.
const existingSrc = extractSrcFromJSON(value);
if (existingSrc) {
$(img).attr('src', existingSrc);
} else {
$(img).attr('src', value);
}
} }
}); });
}); });

Loading…
Cancel
Save