diff --git a/src/extractors/generic/index.js b/src/extractors/generic/index.js
index 7848992b..8eb5f65c 100644
--- a/src/extractors/generic/index.js
+++ b/src/extractors/generic/index.js
@@ -9,6 +9,7 @@ import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
 import GenericNextPageUrlExtractor from './next-page-url/extractor';
 import GenericUrlExtractor from './url/extractor';
 import GenericExcerptExtractor from './excerpt/extractor';
+import GenericWordCountExtractor from './word-count/extractor';
 
 const GenericExtractor = {
   // This extractor is the default for all domains
@@ -22,6 +23,7 @@ const GenericExtractor = {
   nextPageUrl: GenericNextPageUrlExtractor.extract,
   urlAndDomain: GenericUrlExtractor.extract,
   excerpt: GenericExcerptExtractor.extract,
+  wordCount: GenericWordCountExtractor.extract,
 
   extract(options) {
     const { html } = options;
@@ -39,6 +41,7 @@ const GenericExtractor = {
     const dek = this.dek({ ...options, content });
     const nextPageUrl = this.nextPageUrl(options);
     const excerpt = this.excerpt({ ...options, content });
+    const wordCount = this.wordCount({ ...options, content });
     const { url, domain } = this.urlAndDomain(options);
 
     return {
@@ -52,6 +55,7 @@ const GenericExtractor = {
       url,
       domain,
       excerpt,
+      wordCount,
     };
   },
 };
diff --git a/src/extractors/generic/word-count/extractor.js b/src/extractors/generic/word-count/extractor.js
new file mode 100644
index 00000000..467ae6cb
--- /dev/null
+++ b/src/extractors/generic/word-count/extractor.js
@@ -0,0 +1,16 @@
+import cheerio from 'cheerio'
+
+import { normalizeSpaces } from 'utils/text'
+
+// Counts the words in the text of the first <div> found in `content`.
+const GenericWordCountExtractor = {
+  extract({ content }) {
+    const $ = cheerio.load(content)
+
+    const text = normalizeSpaces($('div').first().text())
+    // ''.split(/\s/) is [''] (length 1); drop empty tokens so empty content counts as 0.
+    return text.split(/\s+/).filter(Boolean).length
+  },
+}
+
+export default GenericWordCountExtractor
diff --git a/src/extractors/generic/word-count/extractor.test.js b/src/extractors/generic/word-count/extractor.test.js
new file mode 100644
index 00000000..22c415c2
--- 
/dev/null +++ b/src/extractors/generic/word-count/extractor.test.js @@ -0,0 +1,21 @@ +import assert from 'assert' + +import GenericWordCountExtractor from './extractor' + +describe('GenericWordCountExtractor', () => { + describe('extact({ content })', () => { + it('counts words', () => { + const content = ` +
+

One two three.

+

Four five six.

+

Seven eight nine.

+

Ten eleven twelve.

+ ` + + const wordCount = GenericWordCountExtractor.extract({ content }) + + assert.equal(wordCount, 12) + }) + }) +}) diff --git a/src/extractors/root-extractor.js b/src/extractors/root-extractor.js index 18d8a98a..08a9dbaf 100644 --- a/src/extractors/root-extractor.js +++ b/src/extractors/root-extractor.js @@ -129,7 +129,9 @@ const RootExtractor = { const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content }); const dek = extractResult({ ...opts, type: 'dek', content }); const excerpt = extractResult({ ...opts, type: 'excerpt', content }); + const wordCount = extractResult({ ...opts, type: 'wordCount', content }); const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' }); + return { title, content, @@ -141,6 +143,7 @@ const RootExtractor = { url, domain, excerpt, + wordCount, }; }, }; diff --git a/src/extractors/root-extractor.test.js b/src/extractors/root-extractor.test.js index 76b1199a..649630e1 100644 --- a/src/extractors/root-extractor.test.js +++ b/src/extractors/root-extractor.test.js @@ -21,12 +21,14 @@ describe('RootExtractor', () => { const { url, title, + wordCount, } = RootExtractor.extract( NYMagExtractor, { url: fullUrl, html, $, metaCache: [] } ); assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation'); assert.equal(url, fullUrl); + assert.equal(wordCount, 727); }); }); diff --git a/src/utils/dom/fixtures/html.js b/src/utils/dom/fixtures/html.js index 97dc6148..a8ab7971 100644 --- a/src/utils/dom/fixtures/html.js +++ b/src/utils/dom/fixtures/html.js @@ -511,6 +511,9 @@ const HTML = {
  • Lose this one
  • Lose this one
  • Lose this one
  • +
  • Lose this one
  • +
  • Lose this one
  • +
  • Lose this one
  • `, diff --git a/src/utils/text/normalize-spaces.js b/src/utils/text/normalize-spaces.js index 130d9bd5..f0c42b66 100644 --- a/src/utils/text/normalize-spaces.js +++ b/src/utils/text/normalize-spaces.js @@ -1,4 +1,4 @@ -const NORMALIZE_RE = /\s{2,}/; +const NORMALIZE_RE = /\s{2,}/g; export default function normalizeSpaces(text) { return text.replace(NORMALIZE_RE, ' ').trim();