pull/724/merge
Naftali Beder 1 year ago committed by GitHub
commit 1237f9ead6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -10,7 +10,7 @@ export const MediumExtractor = {
},
content: {
selectors: ['article'],
selectors: ['article', 'article>div[class=l]'],
// Is there anything in the content you selected that needs to be transformed
// before it's consumable content? E.g., unusual lazy loaded images
@ -22,6 +22,7 @@ export const MediumExtractor = {
$node.replaceWith($text);
}
},
// Re-write lazy-loaded youtube videos
iframe: $node => {
const ytRe = /https:\/\/i.embed.ly\/.+url=https:\/\/i\.ytimg\.com\/vi\/(\w+)\//;

@ -4,18 +4,18 @@ import cheerio from 'cheerio';
import Mercury from 'mercury';
import getExtractor from 'extractors/get-extractor';
import { excerptContent } from 'utils/text';
import { excerptContentRange } from 'utils/text';
const fs = require('fs');
describe('MediumExtractor', () => {
describe('extract medium article - the wtf economy', () => {
describe('initial test case', () => {
let result;
let url;
beforeAll(() => {
url =
'https://medium.com/the-wtf-economy/wtf-whats-the-future-e52ab9515573#.ilwrgwsks';
const html = fs.readFileSync('./fixtures/medium.com.html');
const html = fs.readFileSync('./fixtures/medium.com--future.html');
result = Mercury.parse(url, { html, fallback: false });
});
@ -70,28 +70,25 @@ describe('MediumExtractor', () => {
const { content } = await result;
const $ = cheerio.load(content || '');
const text = $.text();
const first13 = excerptContent(
$('*')
.first()
.text(),
13
);
const slice1 = excerptContentRange(text, 0, 13);
assert.equal(
first13,
slice1,
'Last Thursday, I had the honor to be one of the warmup acts'
);
const slice2 = excerptContentRange(text, -37, -28);
assert.equal(slice2, 'Can we hand off a better world to our');
});
});
describe('works with another url', () => {
describe('extract medium article - the mtg color wheel', () => {
let result;
let url;
beforeAll(() => {
url =
'https://medium.com/@JakobUlbrich/flag-attributes-in-android-how-to-use-them-ac4ec8aee7d1#.h949wjmyw';
const html = fs.readFileSync('./fixtures/medium.com--another.html');
url = 'https://humanparts.medium.com/the-mtg-color-wheel-c9700a7cf36d';
const html = fs.readFileSync('./fixtures/medium.com--mtg.html');
result = Mercury.parse(url, { html, fallback: false });
});
@ -99,12 +96,18 @@ describe('MediumExtractor', () => {
const { content } = await result;
const $ = cheerio.load(content || '');
const text = $.text();
const first13 = excerptContent($.text(), 13);
const slice1 = excerptContentRange(text, 18, 18 + 12);
assert.equal(
slice1,
'Magic: The Gathering is a fantasy card game by Richard Garfield, Ph.D.'
);
const slice2 = excerptContentRange(text, -16);
assert.equal(
first13,
'Im sure you have seen something like the following line very often while'
slice2,
'What sorts of things will I say? What sorts of things are likely to land flat?'
);
});
});

@ -1,7 +1,20 @@
export default function excerptContent(content, words = 10) {
/**
 * Build a short excerpt made of the leading words of a piece of text.
 *
 * Surrounding whitespace is trimmed, the text is split on runs of
 * whitespace, and the kept words are re-joined with single spaces.
 *
 * @param {string} content - Text to take the excerpt from.
 * @param {number} [words=10] - Upper bound passed to `slice` as the end index.
 * @returns {string} The selected words separated by single spaces.
 */
export function excerptContent(content, words = 10) {
  const tokens = content.trim().split(/\s+/);
  const leading = tokens.slice(0, words);
  return leading.join(' ');
}
/**
 * Build an excerpt from an arbitrary range of words in a piece of text.
 *
 * Surrounding whitespace is trimmed, the text is split on runs of
 * whitespace, and the words selected by `slice(start, end)` are re-joined
 * with single spaces. Because the indices are handed straight to
 * `Array.prototype.slice`, negative values count from the end of the word
 * list and an omitted `end` selects through the last word.
 *
 * @param {string} content - Text to take the excerpt from.
 * @param {number} [start] - Index of the first word to keep.
 * @param {number} [end] - Index of the word at which to stop (exclusive).
 * @returns {string} The selected words separated by single spaces.
 */
export function excerptContentRange(content, start, end) {
  const tokens = content.trim().split(/\s+/);
  const selected = tokens.slice(start, end);
  return selected.join(' ');
}
// Default export: a single object exposing both excerpt helpers as
// properties, so a default import of this module yields an object rather
// than a function.
export default {
excerptContent,
excerptContentRange,
};

@ -1,5 +1,5 @@
import assert from 'assert';
import excerptContent from './excerpt-content';
import { excerptContent } from './excerpt-content';
describe('excerptContent(content, words)', () => {
it('extracts the requested number of words from content', () => {

@ -4,5 +4,5 @@ export { default as pageNumFromUrl } from './page-num-from-url';
export { default as removeAnchor } from './remove-anchor';
export { default as articleBaseUrl } from './article-base-url';
export { default as hasSentenceEnd } from './has-sentence-end';
export { default as excerptContent } from './excerpt-content';
export { excerptContent, excerptContentRange } from './excerpt-content';
export { default as getEncoding } from './get-encoding';

Loading…
Cancel
Save