getWeight with tests
parent
db3b1ec271
commit
89a2cfbb82
@ -0,0 +1,49 @@
|
||||
// HTML fixtures for the getWeight tests.
// Each key names the id/class combination carried by the wrapping <div>;
// the inner paragraph is identical everywhere so only the attributes vary.
const HTML = {
  // id only, expected to score positively
  positiveId: `
    <div id="entry">
      <p>Ooo good one</p>
    </div>
  `,
  // id only, expected to score negatively
  negativeId: `
    <div id="adbox">
      <p>Ooo good one</p>
    </div>
  `,
  // class only, expected to score positively
  positiveClass: `
    <div class="entry">
      <p>Ooo good one</p>
    </div>
  `,
  // class only, expected to score negatively.
  // This must be a class attribute (not an id) so the class-scoring
  // branch of getWeight is the one under test.
  negativeClass: `
    <div class="comment ad">
      <p>Ooo good one</p>
    </div>
  `,
  // positive id combined with a positive class
  positiveIdAndClass: `
    <div id="article" class="entry">
      <p>Ooo good one</p>
    </div>
  `,
  // positive id combined with a negative class
  positiveIdNegClass: `
    <div id="article" class="adbox">
      <p>Ooo good one</p>
    </div>
  `,
  // class only, expected to match the photo hints
  positivePhotoClass: `
    <div class="figure">
      <p>Ooo good one</p>
    </div>
  `,
  // positive id combined with a photo-hint class
  positiveIdAndPhoto: `
    <div id="article" class="figure">
      <p>Ooo good one</p>
    </div>
  `,
  // neutral id combined with the entry-content-asset class
  entryContentAsset: `
    <div id="foo" class="entry-content-asset">
      <p>Ooo good one</p>
    </div>
  `,
}
|
||||
|
||||
export default HTML
|
@ -0,0 +1,56 @@
|
||||
import {
|
||||
NEGATIVE_SCORE_RE,
|
||||
POSITIVE_SCORE_RE,
|
||||
PHOTO_HINTS_RE,
|
||||
READABILITY_ASSET,
|
||||
} from '../constants'
|
||||
|
||||
|
||||
/**
 * Get the score of a node based on its `class` and `id` attributes.
 *
 * Scoring rules, applied in order:
 *  - id: +25 if it matches POSITIVE_SCORE_RE, -25 if it matches
 *    NEGATIVE_SCORE_RE (both may apply and cancel out).
 *  - class: the same +25 / -25 checks, but only when the id left the
 *    score at exactly 0.
 *  - class: +10 if it matches PHOTO_HINTS_RE, regardless of the id score.
 *  - class: +25 if it matches READABILITY_ASSET, regardless of the id score.
 *
 * @param {Object} node - object exposing `attr(name)`, which is read for
 *   'class' and 'id' (the tests pass a cheerio selection).
 * @returns {number} the accumulated weight; 0 when nothing matched.
 */
export default function getWeight(node) {
  const classes = node.attr('class')
  const id = node.attr('id')
  let score = 0

  if (id) {
    // if id exists, try to score on both positive and negative
    if (POSITIVE_SCORE_RE.test(id)) {
      score += 25
    }
    if (NEGATIVE_SCORE_RE.test(id)) {
      score -= 25
    }
  }

  if (classes) {
    if (score === 0) {
      // if classes exist and id did not contribute to score
      // try to score on both positive and negative
      if (POSITIVE_SCORE_RE.test(classes)) {
        score += 25
      }
      if (NEGATIVE_SCORE_RE.test(classes)) {
        score -= 25
      }
    }

    // even if score has been set by id, add score for
    // possible photo matches
    // "try to keep photos if we can"
    if (PHOTO_HINTS_RE.test(classes)) {
      score += 10
    }

    // add 25 if class matches entry-content-asset,
    // a class apparently instructed for use in the
    // Readability publisher guidelines
    // https://www.readability.com/developers/guidelines
    if (READABILITY_ASSET.test(classes)) {
      score += 25
    }
  }

  return score
}
|
||||
|
@ -0,0 +1 @@
|
||||
// Re-export the default export of ./getWeight under the named binding `getWeight`.
export { default as getWeight } from './getWeight'
|
@ -0,0 +1,59 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
import {
|
||||
getWeight
|
||||
} from './index'
|
||||
|
||||
// Tests for getWeight: each case loads one fixture, selects its <div>,
// and checks the exact numeric weight returned.
describe('Generic Extractor Utils', () => {
  describe('getWeight(node)', () => {
    it("returns a score of 25 if node has positive id", () => {
      const $ = cheerio.load(HTML.positiveId)
      assert.strictEqual(getWeight($('div')), 25)
    })

    it("returns a score of -25 if node has negative id", () => {
      const $ = cheerio.load(HTML.negativeId)
      assert.strictEqual(getWeight($('div')), -25)
    })

    it("returns a score of 25 if node has positive class", () => {
      const $ = cheerio.load(HTML.positiveClass)
      assert.strictEqual(getWeight($('div')), 25)
    })

    it("returns a score of -25 if node has negative class", () => {
      const $ = cheerio.load(HTML.negativeClass)
      assert.strictEqual(getWeight($('div')), -25)
    })

    it("returns a score of 25 if node has both positive id and class", () => {
      // the class is only scored when the id left the score at 0,
      // so the two positives do not stack
      const $ = cheerio.load(HTML.positiveIdAndClass)
      assert.strictEqual(getWeight($('div')), 25)
    })

    it("returns a score of 25 if node has pos id and neg class", () => {
      // is this really wanted? id="article" class="adbox"
      // should get positive score? (the negative class is ignored
      // because the id already produced a non-zero score)
      const $ = cheerio.load(HTML.positiveIdNegClass)
      assert.strictEqual(getWeight($('div')), 25)
    })

    it("returns a score of 10 if node has pos img class", () => {
      const $ = cheerio.load(HTML.positivePhotoClass)
      assert.strictEqual(getWeight($('div')), 10)
    })

    it("returns a score of 35 if node has pos id pos img class", () => {
      const $ = cheerio.load(HTML.positiveIdAndPhoto)
      assert.strictEqual(getWeight($('div')), 35)
    })

    it("adds an add'l 25 (total 50) if node uses entry-content-asset class", () => {
      const $ = cheerio.load(HTML.entryContentAsset)
      assert.strictEqual(getWeight($('div')), 50)
    })
  })
})
|
@ -0,0 +1,18 @@
|
||||
// HTML fixtures for the nodeIsSufficient tests.
const HTML = {
  // paragraph text well under 100 characters
  tooShort: `
    <div class="foo bar">
      <p>This is too short</p>
    </div>
  `,
  // paragraph text of 100 characters, plus surrounding whitespace
  longEnough: `
    <div class="foo bar">
      <p>
        Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m
      </p>
    </div>
  `,
}
|
||||
|
||||
export default HTML
|
@ -1,9 +1,23 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
import {
|
||||
nodeIsSufficient
|
||||
} from './index'
|
||||
|
||||
// Tests for nodeIsSufficient: each case loads one fixture and passes the
// document root to nodeIsSufficient, checking the returned flag.
describe('Utils', () => {
  describe('nodeIsSufficient(node)', () => {
    it("returns false if node text length < 100 chars", () => {
      const $ = cheerio.load(HTML.tooShort)
      const sufficient = nodeIsSufficient($.root())
      assert.equal(sufficient, false)
    })

    it("returns true if node text length > 100 chars", () => {
      const $ = cheerio.load(HTML.longEnough)
      const sufficient = nodeIsSufficient($.root())
      assert.equal(sufficient, true)
    })
  })
})
|
||||
|
Loading…
Reference in New Issue