From daa9266182e93412a55b8e6cd493a00e22475935 Mon Sep 17 00:00:00 2001
From: Adam Pash
Date: Wed, 14 Sep 2016 14:58:08 -0400
Subject: [PATCH] feat: generic extractor for word count

Squashed commit of the following:

commit 0aba26ef9efba71a72c76fa351a9037e97fc1e9e
Author: Adam Pash
Date:   Wed Sep 14 14:56:45 2016 -0400

    fix: normalizeSpaces regex fix broke a test

commit 07d60c1c8c6599d6c94d92e5a70649c28d03d6ea
Author: Adam Pash
Date:   Wed Sep 14 14:52:41 2016 -0400

    feat: generic extractor for word count
---
Review notes (text in this section is ignored by `git am`):
  * GenericExtractor.extract() obtains wordCount from this.wordCount(), i.e.
    the GenericWordCountExtractor.extract hook registered on the same object,
    rather than re-running this.excerpt().
  * GenericWordCountExtractor.extract() discards empty tokens after
    splitting, so content without any text reports 0 words.

 src/extractors/generic/index.js               |  4 ++++
 .../generic/word-count/extractor.js           | 14 +++++++++++++
 .../generic/word-count/extractor.test.js      | 21 +++++++++++++++++++
 src/extractors/root-extractor.js              |  3 +++
 src/extractors/root-extractor.test.js         |  2 ++
 src/utils/dom/fixtures/html.js                |  3 +++
 src/utils/text/normalize-spaces.js            |  2 +-
 7 files changed, 48 insertions(+), 1 deletion(-)
 create mode 100644 src/extractors/generic/word-count/extractor.js
 create mode 100644 src/extractors/generic/word-count/extractor.test.js

diff --git a/src/extractors/generic/index.js b/src/extractors/generic/index.js
index 7848992b..8eb5f65c 100644
--- a/src/extractors/generic/index.js
+++ b/src/extractors/generic/index.js
@@ -9,6 +9,7 @@ import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
 import GenericNextPageUrlExtractor from './next-page-url/extractor';
 import GenericUrlExtractor from './url/extractor';
 import GenericExcerptExtractor from './excerpt/extractor';
+import GenericWordCountExtractor from './word-count/extractor';
 
 const GenericExtractor = {
   // This extractor is the default for all domains
@@ -22,6 +23,7 @@ const GenericExtractor = {
   nextPageUrl: GenericNextPageUrlExtractor.extract,
   urlAndDomain: GenericUrlExtractor.extract,
   excerpt: GenericExcerptExtractor.extract,
+  wordCount: GenericWordCountExtractor.extract,
 
   extract(options) {
     const { html } = options;
@@ -39,6 +41,7 @@ const GenericExtractor = {
     const dek = this.dek({ ...options, content });
     const nextPageUrl = this.nextPageUrl(options);
     const excerpt = this.excerpt({ ...options, content });
+    const wordCount = this.wordCount({ ...options, content });
     const { url, domain } = this.urlAndDomain(options);
 
     return {
@@ -52,6 +55,7 @@ const GenericExtractor = {
       url,
       domain,
       excerpt,
+      wordCount,
     };
   },
 };
diff --git a/src/extractors/generic/word-count/extractor.js b/src/extractors/generic/word-count/extractor.js
new file mode 100644
index 00000000..467ae6cb
--- /dev/null
+++ b/src/extractors/generic/word-count/extractor.js
@@ -0,0 +1,14 @@
+import cheerio from 'cheerio'
+
+import { normalizeSpaces } from 'utils/text'
+
+const GenericWordCountExtractor = {
+  extract({ content }) {
+    const $ = cheerio.load(content)
+
+    const text = normalizeSpaces($('div').first().text())
+    return text.split(/\s/).filter(Boolean).length
+  },
+}
+
+export default GenericWordCountExtractor
diff --git a/src/extractors/generic/word-count/extractor.test.js b/src/extractors/generic/word-count/extractor.test.js
new file mode 100644
index 00000000..22c415c2
--- /dev/null
+++ b/src/extractors/generic/word-count/extractor.test.js
@@ -0,0 +1,21 @@
+import assert from 'assert'
+
+import GenericWordCountExtractor from './extractor'
+
+describe('GenericWordCountExtractor', () => {
+  describe('extact({ content })', () => {
+    it('counts words', () => {
+      const content = `
+
+

One two three.

+

Four five six.

+

Seven eight nine.

+

Ten eleven twelve.

+ ` + + const wordCount = GenericWordCountExtractor.extract({ content }) + + assert.equal(wordCount, 12) + }) + }) +}) diff --git a/src/extractors/root-extractor.js b/src/extractors/root-extractor.js index 18d8a98a..08a9dbaf 100644 --- a/src/extractors/root-extractor.js +++ b/src/extractors/root-extractor.js @@ -129,7 +129,9 @@ const RootExtractor = { const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content }); const dek = extractResult({ ...opts, type: 'dek', content }); const excerpt = extractResult({ ...opts, type: 'excerpt', content }); + const wordCount = extractResult({ ...opts, type: 'wordCount', content }); const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' }); + return { title, content, @@ -141,6 +143,7 @@ const RootExtractor = { url, domain, excerpt, + wordCount, }; }, }; diff --git a/src/extractors/root-extractor.test.js b/src/extractors/root-extractor.test.js index 76b1199a..649630e1 100644 --- a/src/extractors/root-extractor.test.js +++ b/src/extractors/root-extractor.test.js @@ -21,12 +21,14 @@ describe('RootExtractor', () => { const { url, title, + wordCount, } = RootExtractor.extract( NYMagExtractor, { url: fullUrl, html, $, metaCache: [] } ); assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation'); assert.equal(url, fullUrl); + assert.equal(wordCount, 727); }); }); diff --git a/src/utils/dom/fixtures/html.js b/src/utils/dom/fixtures/html.js index 97dc6148..a8ab7971 100644 --- a/src/utils/dom/fixtures/html.js +++ b/src/utils/dom/fixtures/html.js @@ -511,6 +511,9 @@ const HTML = {
  • Lose this one
  • Lose this one
  • Lose this one
  • +
  • Lose this one
  • +
  • Lose this one
  • +
  • Lose this one
  • `, diff --git a/src/utils/text/normalize-spaces.js b/src/utils/text/normalize-spaces.js index 130d9bd5..f0c42b66 100644 --- a/src/utils/text/normalize-spaces.js +++ b/src/utils/text/normalize-spaces.js @@ -1,4 +1,4 @@ -const NORMALIZE_RE = /\s{2,}/; +const NORMALIZE_RE = /\s{2,}/g; export default function normalizeSpaces(text) { return text.replace(NORMALIZE_RE, ' ').trim();