maintenance update - october 2022 (#696)

* fix: add alternative word count method

* fix: replace pages_rendered key with rendered_pages for consistency

* fix: return first lead_image_url when multiple og:image present

* fix: properly pull image src from lazy loaded img

* fix: allow drop cap character in medium custom extractor

* fix: refined medium parser
pull/702/head^2
Michael Ashley 2 years ago committed by GitHub
parent 8ca8a5f7e5
commit ab401822aa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -54,7 +54,7 @@ export default async function collectAllPages({
return {
...result,
total_pages: pages,
pages_rendered: pages,
rendered_pages: pages,
word_count,
};
}

@ -15,6 +15,13 @@ export const MediumExtractor = {
// Is there anything in the content you selected that needs transformed
// before it's consumable content? E.g., unusual lazy loaded images
transforms: {
// Allow drop cap character.
'section span:first-of-type': $node => {
const $text = $node.html();
if ($text.length === 1 && /^[a-zA-Z()]+$/.test($text)) {
$node.replaceWith($text);
}
},
// Re-write lazy-loaded youtube videos
iframe: $node => {
const ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;
@ -55,7 +62,7 @@ export const MediumExtractor = {
// Is there anything that is in the result that shouldn't be?
// The clean selectors will remove anything that matches from
// the result
clean: ['span', 'svg'],
clean: ['span a', 'svg'],
},
date_published: {

@ -84,14 +84,14 @@ describe('TechlogIijAdJpExtractor', () => {
);
});
it('returns the pages_rendered', async () => {
it('returns the rendered_pages', async () => {
// To pass this test, fill out the rendered_pages selector
// in ./src/extractors/custom/techlog.iij.ad.jp/index.js.
const { pages_rendered } = await result;
const { rendered_pages } = await result;
// Update these values with the expected values from
// the article.
assert.equal(pages_rendered, null);
assert.equal(rendered_pages, 1);
});
it('returns the content', async () => {

@ -2,13 +2,25 @@ import cheerio from 'cheerio';
import { normalizeSpaces } from 'utils/text';
// Primary word counter: loads the markup, takes the text of the first <div>,
// runs it through normalizeSpaces and counts the pieces produced by splitting
// on single whitespace characters.
const getWordCount = content => {
  const $ = cheerio.load(content);
  const firstDivText = $('div')
    .first()
    .text();
  const words = normalizeSpaces(firstDivText).split(/\s/);
  return words.length;
};
// Fallback word counter that works directly on the markup string.
//
// Every tag is replaced by a space, runs of whitespace are collapsed to a
// single space, the result is trimmed, and the space-separated pieces are
// counted.
//
// @param {string} content - markup to count words in
// @returns {number} number of words; 0 when no text remains after stripping
const getWordCountAlt = content => {
  const text = content
    .replace(/<[^>]*>/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  // ''.split(' ') yields [''], which would report one word for empty content.
  if (text === '') return 0;
  return text.split(' ').length;
};
// Word-count extractor.
//
// extract({ content }) returns the result of getWordCount(content). When
// that result is exactly 1, the count is recomputed with getWordCountAlt
// and that value is returned instead.
const GenericWordCountExtractor = {
  extract({ content }) {
    const count = getWordCount(content);
    // A count of 1 triggers the alternative counter.
    return count === 1 ? getWordCountAlt(content) : count;
  },
};

@ -76,11 +76,13 @@ export function select(opts) {
const { selectors, defaultCleaner = true, allowMultiple } = extractionOpts;
const overrideAllowMultiple = type === 'lead_image_url' || allowMultiple;
const matchingSelector = findMatchingSelector(
$,
selectors,
extractHtml,
allowMultiple
overrideAllowMultiple
);
if (!matchingSelector) return null;

@ -78,10 +78,10 @@ describe('Parser', () => {
'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/';
const result = await Parser.parse(url, { fetchAllPages: true });
const { total_pages, pages_rendered } = result;
const { total_pages, rendered_pages } = result;
assert.equal(total_pages, 3);
assert.equal(pages_rendered, 3);
assert.equal(rendered_pages, 3);
assert.equal(result.next_page_url, `${url}2`);
});

@ -8,6 +8,17 @@ import { IS_LINK, IS_IMAGE, IS_SRCSET } from './constants';
// attribute that is a placeholder. We need to be able to properly fill in
// the src attribute so the images are no longer lazy loaded.
export default function convertLazyLoadedImages($) {
// Try to read a string `src` property out of a JSON-encoded value.
// Returns the src string, or false when the input is not valid JSON or
// carries no string `src`.
const extractSrcFromJSON = str => {
  let parsed;
  try {
    parsed = JSON.parse(str);
  } catch (e) {
    return false;
  }
  // JSON `null` has no properties to read.
  if (parsed === null) return false;
  const candidate = parsed.src;
  return typeof candidate === 'string' ? candidate : false;
};
$('img').each((_, img) => {
const attrs = getAttrs(img);
@ -22,7 +33,13 @@ export default function convertLazyLoadedImages($) {
IS_LINK.test(value) &&
IS_IMAGE.test(value)
) {
$(img).attr('src', value);
// Is the value a JSON object? If so, we should attempt to extract the image src from the data.
const existingSrc = extractSrcFromJSON(value);
if (existingSrc) {
$(img).attr('src', existingSrc);
} else {
$(img).attr('src', value);
}
}
});
});

Loading…
Cancel
Save