feat: generic excerpt extraction

pull/3/head
Adam Pash 8 years ago
parent 457075889d
commit b3481a2c45

@@ -0,0 +1,4 @@
export const EXCERPT_META_SELECTORS = [
  'og:description',
  'twitter:description',
];
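These selectors name the meta tags the generic extractor checks first for a ready-made description. As a point of reference (illustrative, not part of this commit), the tags they are meant to match look like the fixtures used in the tests below:

const html = `
  <head>
    <meta name="og:description" value="A short summary of the page." />
    <meta name="twitter:description" value="A short summary of the page." />
  </head>
`;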

@ -0,0 +1,28 @@
import ellipsize from 'ellipsize'
import {
extractFromMeta,
stripTags,
} from 'utils/dom';
import { EXCERPT_META_SELECTORS } from './constants';
export function clean(content, $, maxLength=200) {
content = content.replace(/[\s\n]+/g, ' ').trim()
return ellipsize(content, 200, { ellipse: '…' })
}
const GenericExcerptExtractor = {
extract({ $, content, metaCache }) {
const excerpt = extractFromMeta($, EXCERPT_META_SELECTORS, metaCache);
if (excerpt) {
return clean(stripTags(excerpt, $));
}
// Fall back to excerpting from the extracted content
const maxLength = 200
const shortContent = content.slice(0, maxLength * 5)
return clean($(shortContent).text(), $, maxLength)
}
}
export default GenericExcerptExtractor
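For orientation, a minimal usage sketch of the new extractor, mirroring the tests below; the `$`, `content`, and `metaCache` arguments are assumed to come from the surrounding pipeline (a cheerio handle for the document, the already-extracted content HTML, and the list of meta tag names found in the page):

import cheerio from 'cheerio';

const $ = cheerio.load('<meta name="og:description" value="A short summary." />');
const excerpt = GenericExcerptExtractor.extract({
  $,
  content: '<div><p>Fallback text, used only when no description meta tag is present.</p></div>',
  metaCache: ['og:description'],
});
// excerpt === 'A short summary.'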

@@ -0,0 +1,84 @@
import assert from 'assert';
import cheerio from 'cheerio';

import {
  default as GenericExcerptExtractor,
  clean,
} from './extractor';

describe('GenericExcerptExtractor', () => {
  describe('extract({ $, content, metaCache })', () => {
    it('returns og:description', () => {
      const actualExcerpt = "Wow this is going to be something good.";
      const html = `
        <html>
          <head>
            <meta name="og:description" value="${actualExcerpt}" />
          </head>
        </html>
      `;
      const $ = cheerio.load(html);
      const metaCache = ['og:description'];

      const excerpt = GenericExcerptExtractor.extract({ $, content: '', metaCache });

      assert.equal(excerpt, actualExcerpt);
    });

    it('returns twitter:description', () => {
      const actualExcerpt = "Wow this is going to be something good.";
      const html = `
        <html>
          <head>
            <meta name="twitter:description" value="${actualExcerpt}" />
          </head>
        </html>
      `;
      const $ = cheerio.load(html);
      const metaCache = ['twitter:description'];

      const excerpt = GenericExcerptExtractor.extract({ $, content: '', metaCache });

      assert.equal(excerpt, actualExcerpt);
    });

    it('falls back to the content', () => {
      const html = `
        <html>
          <head>
          </head>
        </html>
      `;
      const $ = cheerio.load(html);
      const content = "<div><p>Wow <b>this</b> is going to be something good.</p></div>";
      const metaCache = [];

      const excerpt = GenericExcerptExtractor.extract({ $, content, metaCache });

      assert.equal(excerpt, 'Wow this is going to be something good.');
    });
  });
});

describe('clean(text)', () => {
  it('truncates text longer than 200 chars and trims whitespace', () => {
    const longText = `
      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
      incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
      nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
      Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
      fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
      culpa qui officia deserunt mollit anim id est laborum.
    `;
    const text = clean(longText);

    let shouldBe = `
      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
      incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
      exercitation ullamco laboris nisi ut&hellip;
    `;
    shouldBe = shouldBe.replace(/[\s\n]+/g, ' ').trim();

    assert.equal(text, shouldBe);
  });
});

@@ -8,6 +8,7 @@ import GenericDekExtractor from './dek/extractor';
import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
import GenericNextPageUrlExtractor from './next-page-url/extractor';
import GenericUrlExtractor from './url/extractor';
+import GenericExcerptExtractor from './excerpt/extractor';

const GenericExtractor = {
  // This extractor is the default for all domains
@@ -20,6 +21,7 @@ const GenericExtractor = {
  dek: GenericDekExtractor.extract,
  nextPageUrl: GenericNextPageUrlExtractor.extract,
  urlAndDomain: GenericUrlExtractor.extract,
+  excerpt: GenericExcerptExtractor.extract,

  extract(options) {
    const { html } = options;
@@ -33,9 +35,10 @@ const GenericExtractor = {
    const datePublished = this.datePublished(options);
    const author = this.author(options);
    const content = this.content({ ...options, title });
-    const leadImageUrl = this.leadImageUrl(options);
-    const dek = this.dek(options);
+    const leadImageUrl = this.leadImageUrl({ ...options, content });
+    const dek = this.dek({ ...options, content });
    const nextPageUrl = this.nextPageUrl(options);
+    const excerpt = this.excerpt({ ...options, content });
    const { url, domain } = this.urlAndDomain(options);

    return {
@@ -48,6 +51,7 @@ const GenericExtractor = {
      nextPageUrl,
      url,
      domain,
+      excerpt,
    };
  },
};
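With the new field wired in, a caller of the generic extractor sees the excerpt alongside the existing fields. A hedged sketch of the consuming side; the `{ html, url }` options shape is an assumption, since only `html` is destructured in the hunk above:

const { title, content, excerpt } = GenericExtractor.extract({ html, url });
// `excerpt` is either the og:/twitter:description or roughly the first
// 200 characters of the extracted content.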

@@ -1,9 +1,7 @@
import URL from 'url';

import { extractFromMeta } from 'utils/dom';
-import {
-  CANONICAL_META_SELECTORS,
-} from './constants';
+import { CANONICAL_META_SELECTORS } from './constants';

function parseDomain(url) {
  const parsedUrl = URL.parse(url);

@@ -128,6 +128,7 @@ const RootExtractor = {
    });
    const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content });
    const dek = extractResult({ ...opts, type: 'dek', content });
+    const excerpt = extractResult({ ...opts, type: 'excerpt', content });
    const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' });

    return {
      title,
@@ -139,6 +140,7 @@ const RootExtractor = {
      nextPageUrl,
      url,
      domain,
+      excerpt,
    };
  },
};
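The same field is threaded through the root extractor. A rough sketch of the assumption behind the added line, since the fallback path is not shown in this diff: `extractResult` is expected to use a site-specific extractor's `excerpt` definition when one exists and otherwise defer to the generic extractor, so every parse result gains an excerpt either way.

// Assumed behaviour of the call added above (not shown in this diff):
// extractResult({ ...opts, type: 'excerpt', content })
//   -> the custom extractor's `excerpt` selectors, if defined,
//   -> otherwise GenericExtractor.excerpt({ ...opts, content })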
