From b3481a2c45ca5ca5e243c46bdbd4a308d8dd47af Mon Sep 17 00:00:00 2001
From: Adam Pash
Date: Wed, 14 Sep 2016 14:13:59 -0400
Subject: [PATCH] feat: generic excerpt extraction

---
NOTE(review): this patch was rebuilt from a copy whose line breaks and
indentation had been collapsed and whose HTML markup had been stripped.
- Indentation of added and context lines is assumed to be the project's
  2-space style; confirm context lines against the tree before applying.
- The HTML fixtures and the `content` string in extractor.test.js are
  reconstructions that fit the 84-line hunk count; confirm them against the
  original commit.
- excerpt/extractor.js differs from the original commit: `clean` now passes
  its `maxLength` argument to ellipsize instead of a hard-coded 200, no longer
  reassigns its parameter, and is commented. Its blob hash on the `index` line
  therefore no longer matches the content.

 src/extractors/generic/excerpt/constants.js   |  4 +
 src/extractors/generic/excerpt/extractor.js   | 35 ++++++++
 .../generic/excerpt/extractor.test.js         | 84 +++++++++++++++++++
 src/extractors/generic/index.js               |  8 +-
 src/extractors/generic/url/extractor.js       |  4 +-
 src/extractors/root-extractor.js              |  2 +
 6 files changed, 132 insertions(+), 5 deletions(-)
 create mode 100644 src/extractors/generic/excerpt/constants.js
 create mode 100644 src/extractors/generic/excerpt/extractor.js
 create mode 100644 src/extractors/generic/excerpt/extractor.test.js

diff --git a/src/extractors/generic/excerpt/constants.js b/src/extractors/generic/excerpt/constants.js
new file mode 100644
index 00000000..4e8ed239
--- /dev/null
+++ b/src/extractors/generic/excerpt/constants.js
@@ -0,0 +1,4 @@
+export const EXCERPT_META_SELECTORS = [
+  'og:description',
+  'twitter:description',
+];
diff --git a/src/extractors/generic/excerpt/extractor.js b/src/extractors/generic/excerpt/extractor.js
new file mode 100644
index 00000000..5bf183a7
--- /dev/null
+++ b/src/extractors/generic/excerpt/extractor.js
@@ -0,0 +1,35 @@
+import ellipsize from 'ellipsize';
+
+import {
+  extractFromMeta,
+  stripTags,
+} from 'utils/dom';
+
+import { EXCERPT_META_SELECTORS } from './constants';
+
+// Collapses every run of whitespace in `content` to a single space, trims
+// both ends, then hands the result to ellipsize with `maxLength` as the limit
+// and '…' as the truncation marker. `$` is accepted but not used here.
+export function clean(content, $, maxLength = 200) {
+  const collapsed = content.replace(/[\s\n]+/g, ' ').trim();
+  // Pass the caller's limit through instead of a hard-coded 200.
+  return ellipsize(collapsed, maxLength, { ellipse: '…' });
+}
+
+const GenericExcerptExtractor = {
+  // Returns the cleaned meta description when extractFromMeta finds one for
+  // EXCERPT_META_SELECTORS; otherwise builds the excerpt from `content`.
+  extract({ $, content, metaCache }) {
+    const excerpt = extractFromMeta($, EXCERPT_META_SELECTORS, metaCache);
+    if (excerpt) {
+      return clean(stripTags(excerpt, $));
+    }
+    // Fall back to excerpting from the extracted content
+    const maxLength = 200;
+    // Only the first maxLength * 5 characters of the markup are parsed.
+    const shortContent = content.slice(0, maxLength * 5);
+    return clean($(shortContent).text(), $, maxLength);
+  },
+};
+
+export default GenericExcerptExtractor;
diff --git a/src/extractors/generic/excerpt/extractor.test.js b/src/extractors/generic/excerpt/extractor.test.js
new file mode 100644
index 00000000..843e3612
--- /dev/null
+++ b/src/extractors/generic/excerpt/extractor.test.js
@@ -0,0 +1,84 @@
+import assert from 'assert'
+import cheerio from 'cheerio'
+
+import {
+  default as GenericExcerptExtractor,
+  clean,
+} from './extractor'
+
+describe('GenericExcerptExtractor', () => {
+  describe('extract({ $, content, metaCache })', () => {
+    it('returns og:description', () => {
+      const actualExcerpt = "Wow this is going to be something good."
+      const html = `
+        <html>
+          <head>
+            <meta name="og:description" value="${actualExcerpt}" />
+          </head>
+        </html>
+      `;
+      const $ = cheerio.load(html);
+      const metaCache = ['og:description'];
+
+      const excerpt = GenericExcerptExtractor.extract({ $, content: '', metaCache });
+
+      assert.equal(excerpt, actualExcerpt);
+    })
+
+    it('returns twitter:description', () => {
+      const actualExcerpt = "Wow this is going to be something good."
+      const html = `
+        <html>
+          <head>
+            <meta name="twitter:description" value="${actualExcerpt}" />
+          </head>
+        </html>
+      `;
+      const $ = cheerio.load(html);
+      const metaCache = ['twitter:description'];
+
+      const excerpt = GenericExcerptExtractor.extract({ $, content: '', metaCache });
+
+      assert.equal(excerpt, actualExcerpt);
+    })
+
+    it('falls back to the content', () => {
+      const html = `
+        <html>
+          <head>
+          </head>
+        </html>
+      `;
+      const $ = cheerio.load(html);
+      const content = "<div><p>Wow this is going to be something good.</p></div>"
+      const metaCache = [];
+
+      const excerpt = GenericExcerptExtractor.extract({ $, content, metaCache });
+
+      assert.equal(excerpt, 'Wow this is going to be something good.');
+    })
+
+  })
+})
+
+describe('clean(text)', () => {
+  it('truncates text longer than 200 chars and trims whitespance', () => {
+    const longText = `
+      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
+      incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
+      nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
+      Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
+      fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
+      culpa qui officia deserunt mollit anim id est laborum.
+    `
+    const text = clean(longText)
+    let shouldBe = `
+      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
+      incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
+      exercitation ullamco laboris nisi ut…
+    `
+    shouldBe = shouldBe.replace(/[\s\n]+/g, ' ').trim()
+
+    assert.equal(text, shouldBe)
+  })
+})
diff --git a/src/extractors/generic/index.js b/src/extractors/generic/index.js
index 0bee6ecd..7848992b 100644
--- a/src/extractors/generic/index.js
+++ b/src/extractors/generic/index.js
@@ -8,6 +8,7 @@ import GenericDekExtractor from './dek/extractor';
 import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
 import GenericNextPageUrlExtractor from './next-page-url/extractor';
 import GenericUrlExtractor from './url/extractor';
+import GenericExcerptExtractor from './excerpt/extractor';
 
 const GenericExtractor = {
   // This extractor is the default for all domains
@@ -20,6 +21,7 @@ const GenericExtractor = {
   dek: GenericDekExtractor.extract,
   nextPageUrl: GenericNextPageUrlExtractor.extract,
   urlAndDomain: GenericUrlExtractor.extract,
+  excerpt: GenericExcerptExtractor.extract,
 
   extract(options) {
     const { html } = options;
@@ -33,9 +35,10 @@ const GenericExtractor = {
     const datePublished = this.datePublished(options);
     const author = this.author(options);
     const content = this.content({ ...options, title });
-    const leadImageUrl = this.leadImageUrl(options);
-    const dek = this.dek(options);
+    const leadImageUrl = this.leadImageUrl({ ...options, content });
+    const dek = this.dek({ ...options, content });
     const nextPageUrl = this.nextPageUrl(options);
+    const excerpt = this.excerpt({ ...options, content });
     const { url, domain } = this.urlAndDomain(options);
 
     return {
@@ -48,6 +51,7 @@ const GenericExtractor = {
       nextPageUrl,
       url,
       domain,
+      excerpt,
     };
   },
 };
diff --git a/src/extractors/generic/url/extractor.js b/src/extractors/generic/url/extractor.js
index 8347abd0..53b02c73 100644
--- a/src/extractors/generic/url/extractor.js
+++ b/src/extractors/generic/url/extractor.js
@@ -1,9 +1,7 @@
 import URL from 'url';
 import { extractFromMeta } from 'utils/dom';
 
-import {
-  CANONICAL_META_SELECTORS,
-} from './constants';
+import { CANONICAL_META_SELECTORS } from './constants';
 
 function parseDomain(url) {
   const parsedUrl = URL.parse(url);
diff --git a/src/extractors/root-extractor.js b/src/extractors/root-extractor.js
index ed531b85..18d8a98a 100644
--- a/src/extractors/root-extractor.js
+++ b/src/extractors/root-extractor.js
@@ -128,6 +128,7 @@ const RootExtractor = {
     });
     const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content });
     const dek = extractResult({ ...opts, type: 'dek', content });
+    const excerpt = extractResult({ ...opts, type: 'excerpt', content });
     const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' });
     return {
       title,
@@ -139,6 +140,7 @@ const RootExtractor = {
       nextPageUrl,
       url,
       domain,
+      excerpt,
     };
   },
 };