feat: generic extractor for word count

Squashed commit of the following:

commit 0aba26ef9efba71a72c76fa351a9037e97fc1e9e
Author: Adam Pash <adam.pash@gmail.com>
Date:   Wed Sep 14 14:56:45 2016 -0400

    fix: normalizeSpaces regex fix broke a test

commit 07d60c1c8c6599d6c94d92e5a70649c28d03d6ea
Author: Adam Pash <adam.pash@gmail.com>
Date:   Wed Sep 14 14:52:41 2016 -0400

    feat: generic extractor for word count
pull/3/head
Adam Pash 8 years ago
parent 76df30e303
commit daa9266182

@ -9,6 +9,7 @@ import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
import GenericNextPageUrlExtractor from './next-page-url/extractor';
import GenericUrlExtractor from './url/extractor';
import GenericExcerptExtractor from './excerpt/extractor';
import GenericWordCountExtractor from './word-count/extractor';
const GenericExtractor = {
// This extractor is the default for all domains
@ -22,6 +23,7 @@ const GenericExtractor = {
nextPageUrl: GenericNextPageUrlExtractor.extract,
urlAndDomain: GenericUrlExtractor.extract,
excerpt: GenericExcerptExtractor.extract,
wordCount: GenericWordCountExtractor.extract,
extract(options) {
const { html } = options;
@ -39,6 +41,7 @@ const GenericExtractor = {
const dek = this.dek({ ...options, content });
const nextPageUrl = this.nextPageUrl(options);
const excerpt = this.excerpt({ ...options, content });
// Delegate to the word-count extractor registered on this object
// (wordCount: GenericWordCountExtractor.extract), not the excerpt extractor.
const wordCount = this.wordCount({ ...options, content });
const { url, domain } = this.urlAndDomain(options);
return {
@ -52,6 +55,7 @@ const GenericExtractor = {
url,
domain,
excerpt,
wordCount,
};
},
};

@ -0,0 +1,14 @@
import cheerio from 'cheerio'
import { normalizeSpaces } from 'utils/text'
// Generic extractor that reports how many words the extracted article
// content contains.
const GenericWordCountExtractor = {
  /**
   * Count the words in the text of the first <div> found in `content`.
   *
   * @param {Object} options
   * @param {string} options.content - HTML of the extracted article content.
   * @returns {number} Number of whitespace-separated tokens in that text,
   *   or 0 when there is no text to count.
   */
  extract({ content }) {
    const $ = cheerio.load(content)
    const text = normalizeSpaces($('div').first().text())

    // ''.split(/\s/) yields [''] (length 1), so empty text has to be
    // special-cased or a blank document would report one word.
    if (text === '') return 0

    return text.split(/\s/).length
  },
}
export default GenericWordCountExtractor

@ -0,0 +1,21 @@
import assert from 'assert'
import GenericWordCountExtractor from './extractor'
describe('GenericWordCountExtractor', () => {
  describe('extract({ content })', () => {
    it('counts words', () => {
      // Four paragraphs of three words each, wrapped in a single <div>,
      // which is the element the extractor reads its text from.
      const content = `
        <div>
          <p>One two three.</p>
          <p>Four five six.</p>
          <p>Seven eight nine.</p>
          <p>Ten eleven twelve.</p>
        </div>
      `

      const wordCount = GenericWordCountExtractor.extract({ content })

      // strictEqual so a non-number result (e.g. the string '12') fails.
      assert.strictEqual(wordCount, 12)
    })
  })
})

@ -129,7 +129,9 @@ const RootExtractor = {
const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content });
const dek = extractResult({ ...opts, type: 'dek', content });
const excerpt = extractResult({ ...opts, type: 'excerpt', content });
const wordCount = extractResult({ ...opts, type: 'wordCount', content });
const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' });
return {
title,
content,
@ -141,6 +143,7 @@ const RootExtractor = {
url,
domain,
excerpt,
wordCount,
};
},
};

@ -21,12 +21,14 @@ describe('RootExtractor', () => {
const {
url,
title,
wordCount,
} = RootExtractor.extract(
NYMagExtractor, { url: fullUrl, html, $, metaCache: [] }
);
assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation');
assert.equal(url, fullUrl);
assert.equal(wordCount, 727);
});
});

@ -511,6 +511,9 @@ const HTML = {
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,

@ -1,4 +1,4 @@
const NORMALIZE_RE = /\s{2,}/;
const NORMALIZE_RE = /\s{2,}/g;
export default function normalizeSpaces(text) {
return text.replace(NORMALIZE_RE, ' ').trim();

Loading…
Cancel
Save