feat: generic excerpt extraction

This commit is contained in:
Adam Pash 2016-09-14 14:13:59 -04:00
parent 457075889d
commit b3481a2c45
6 changed files with 125 additions and 5 deletions

View File

@ -0,0 +1,4 @@
// Meta tag names that may carry an author-supplied page description,
// checked in order of preference by the generic excerpt extractor.
export const EXCERPT_META_SELECTORS = [
'og:description',
'twitter:description',
];

View File

@ -0,0 +1,28 @@
import ellipsize from 'ellipsize'
import {
extractFromMeta,
stripTags,
} from 'utils/dom';
import { EXCERPT_META_SELECTORS } from './constants';
// Normalizes excerpt text: collapses runs of whitespace/newlines to single
// spaces, trims the ends, and truncates to at most `maxLength` characters,
// appending an ellipsis when the text was cut.
//
// content   - raw excerpt text
// $         - cheerio instance (currently unused; kept for parity with the
//             other cleaner signatures in this project)
// maxLength - maximum excerpt length in characters (default 200)
//
// Returns the cleaned, possibly-truncated string.
export function clean(content, $, maxLength = 200) {
  content = content.replace(/[\s\n]+/g, ' ').trim();
  // Fix: pass maxLength through rather than the hard-coded 200, so callers
  // that request a different length are honored. The '&hellip;' entity is
  // what the clean() unit test expects as the appended ellipsis.
  return ellipsize(content, maxLength, { ellipse: '&hellip;' });
}
// Derives a short plain-text excerpt for a document. An author-supplied
// meta description (og:description / twitter:description) wins; otherwise
// the excerpt is cut from the beginning of the extracted content.
const GenericExcerptExtractor = {
  extract({ $, content, metaCache }) {
    // Prefer a description the page author wrote themselves.
    const metaDescription = extractFromMeta($, EXCERPT_META_SELECTORS, metaCache);
    if (metaDescription) {
      return clean(stripTags(metaDescription, $));
    }

    // No meta description: excerpt the extracted content instead. Slice
    // well past the target length (5x) so enough text survives once the
    // markup has been stripped out.
    const maxLength = 200;
    const shortContent = content.slice(0, maxLength * 5);
    return clean($(shortContent).text(), $, maxLength);
  },
};

export default GenericExcerptExtractor;

View File

@ -0,0 +1,84 @@
import assert from 'assert'
import cheerio from 'cheerio'
import {
default as GenericExcerptExtractor,
clean,
} from './extractor'
describe('GenericExcerptExtractor', () => {
  describe('extract({ $, content, metaCache })', () => {
    it('returns og:description', () => {
      const actualExcerpt = "Wow this is going to be something good.";
      const html = `
        <html>
          <head>
            <meta name="og:description" value="${actualExcerpt}" />
          </head>
        </html>
      `;
      const $ = cheerio.load(html);
      const metaCache = ['og:description'];

      const excerpt =
        GenericExcerptExtractor.extract({ $, content: '', metaCache });

      assert.equal(excerpt, actualExcerpt);
    });

    it('returns twitter:description', () => {
      const actualExcerpt = "Wow this is going to be something good.";
      const html = `
        <html>
          <head>
            <meta name="twitter:description" value="${actualExcerpt}" />
          </head>
        </html>
      `;
      const $ = cheerio.load(html);
      const metaCache = ['twitter:description'];

      const excerpt =
        GenericExcerptExtractor.extract({ $, content: '', metaCache });

      assert.equal(excerpt, actualExcerpt);
    });

    it('falls back to the content', () => {
      // No meta description at all, so the extractor must excerpt the
      // supplied content markup instead.
      const html = `
        <html>
          <head>
          </head>
        </html>
      `;
      const $ = cheerio.load(html);
      const content =
        "<div><p>Wow <b>this</b> is going to be something good.</p></div>";
      const metaCache = [];

      const excerpt =
        GenericExcerptExtractor.extract({ $, content, metaCache });

      assert.equal(excerpt, 'Wow this is going to be something good.');
    });
  });
});
// Unit tests for clean(): whitespace collapsing plus 200-char truncation.
describe('clean(text)', () => {
  // Fixed "whitespance" typo in the test description.
  it('truncates text longer than 200 chars and trims whitespace', () => {
    const longText = `
      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
      incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
      nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
      Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
      fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
      culpa qui officia deserunt mollit anim id est laborum.
    `;
    const text = clean(longText);

    // Expected: whitespace collapsed to single spaces, trimmed, then cut
    // near 200 chars with the ellipsis appended.
    let shouldBe = `
      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
      incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
      exercitation ullamco laboris nisi ut&hellip;
    `;
    shouldBe = shouldBe.replace(/[\s\n]+/g, ' ').trim();

    assert.equal(text, shouldBe);
  });
});

View File

@ -8,6 +8,7 @@ import GenericDekExtractor from './dek/extractor';
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'; import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
import GenericNextPageUrlExtractor from './next-page-url/extractor'; import GenericNextPageUrlExtractor from './next-page-url/extractor';
import GenericUrlExtractor from './url/extractor'; import GenericUrlExtractor from './url/extractor';
import GenericExcerptExtractor from './excerpt/extractor';
const GenericExtractor = { const GenericExtractor = {
// This extractor is the default for all domains // This extractor is the default for all domains
@ -20,6 +21,7 @@ const GenericExtractor = {
dek: GenericDekExtractor.extract, dek: GenericDekExtractor.extract,
nextPageUrl: GenericNextPageUrlExtractor.extract, nextPageUrl: GenericNextPageUrlExtractor.extract,
urlAndDomain: GenericUrlExtractor.extract, urlAndDomain: GenericUrlExtractor.extract,
excerpt: GenericExcerptExtractor.extract,
extract(options) { extract(options) {
const { html } = options; const { html } = options;
@ -33,9 +35,10 @@ const GenericExtractor = {
const datePublished = this.datePublished(options); const datePublished = this.datePublished(options);
const author = this.author(options); const author = this.author(options);
const content = this.content({ ...options, title }); const content = this.content({ ...options, title });
const leadImageUrl = this.leadImageUrl(options); const leadImageUrl = this.leadImageUrl({ ...options, content });
const dek = this.dek(options); const dek = this.dek({ ...options, content });
const nextPageUrl = this.nextPageUrl(options); const nextPageUrl = this.nextPageUrl(options);
const excerpt = this.excerpt({ ...options, content });
const { url, domain } = this.urlAndDomain(options); const { url, domain } = this.urlAndDomain(options);
return { return {
@ -48,6 +51,7 @@ const GenericExtractor = {
nextPageUrl, nextPageUrl,
url, url,
domain, domain,
excerpt,
}; };
}, },
}; };

View File

@ -1,9 +1,7 @@
import URL from 'url'; import URL from 'url';
import { extractFromMeta } from 'utils/dom'; import { extractFromMeta } from 'utils/dom';
import { import { CANONICAL_META_SELECTORS } from './constants';
CANONICAL_META_SELECTORS,
} from './constants';
function parseDomain(url) { function parseDomain(url) {
const parsedUrl = URL.parse(url); const parsedUrl = URL.parse(url);

View File

@ -128,6 +128,7 @@ const RootExtractor = {
}); });
const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content }); const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content });
const dek = extractResult({ ...opts, type: 'dek', content }); const dek = extractResult({ ...opts, type: 'dek', content });
const excerpt = extractResult({ ...opts, type: 'excerpt', content });
const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' }); const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' });
return { return {
title, title,
@ -139,6 +140,7 @@ const RootExtractor = {
nextPageUrl, nextPageUrl,
url, url,
domain, domain,
excerpt,
}; };
}, },
}; };