getWeight with tests

pull/1/head
Adam Pash 8 years ago
parent db3b1ec271
commit 89a2cfbb82

@ -5,7 +5,7 @@
"main": "index.js",
"scripts": {
"start": "node ./build",
"test": "mocha --compilers js:babel-register src/**/**/*.test.js"
"test": "mocha --compilers js:babel-register --recursive src/**/*.test.js"
},
"author": "",
"license": "ISC",

@ -747,6 +747,8 @@ export const POSITIVE_SCORE_HINTS = [
// Every entry of POSITIVE_SCORE_HINTS joined with '|' into a single
// case-insensitive regular expression, so one test matches any of the hints.
export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i')
// Case-insensitive match for the "entry-content-asset" marker, a hint taken
// from the Readability publisher-specific guidelines.
export const READABILITY_ASSET = /entry-content-asset/i
// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
@ -864,7 +866,8 @@ export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|
// Does not match:
// pg=102
// page:2
export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)?(=|\/)(?P<pagenum>[0-9]{1,2})))', 'i')
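// TODO(AP): Disabled for now. Note that the pattern below uses Python's
// (?P<name>...) named-group syntax, which JavaScript's RegExp does not accept.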
// export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)?(=|\/)(?P<pagenum>[0-9]{1,2})))', 'i')
// Case-insensitive match for anything that reads like "page", "paging",
// or "pagination" (the stem "pag" followed by e, ing, or inat).
export const PAGE_RE = /pag(e|ing|inat)/i

@ -0,0 +1,49 @@
// HTML snippets used by the getWeight specs. Each fixture is a single <div>
// whose id and/or class attribute is what the scoring under test inspects.
// Template contents are intentionally left unindented.
const HTML = {
  // id matches a positive hint
  positiveId: `
<div id="entry">
<p>Ooo good one</p>
</div>
`,
  // id matches a negative hint
  negativeId: `
<div id="adbox">
<p>Ooo good one</p>
</div>
`,
  // class matches a positive hint
  positiveClass: `
<div class="entry">
<p>Ooo good one</p>
</div>
`,
  // class (not id) carries the negative hints, so the class branch of the
  // scoring is the one being exercised
  negativeClass: `
<div class="comment ad">
<p>Ooo good one</p>
</div>
`,
  // both id and class match positive hints
  positiveIdAndClass: `
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`,
  // positive id combined with a negative class
  positiveIdNegClass: `
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`,
  // class matches a photo hint only
  positivePhotoClass: `
<div class="figure">
<p>Ooo good one</p>
</div>
`,
  // positive id combined with a photo-hint class
  positiveIdAndPhoto: `
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`,
  // class uses the entry-content-asset marker
  entryContentAsset: `
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`,
}
export default HTML

@ -0,0 +1,56 @@
import {
NEGATIVE_SCORE_RE,
POSITIVE_SCORE_RE,
PHOTO_HINTS_RE,
READABILITY_ASSET,
} from '../constants'
/**
 * Get the score of a node based on its class and id attributes.
 *
 * Scoring rules:
 *  - id: +25 if it matches POSITIVE_SCORE_RE, -25 if it matches
 *    NEGATIVE_SCORE_RE (both may apply and cancel out).
 *  - class: scored the same way, but only while the running score is still 0.
 *    Note this includes the case where the id matched both patterns and
 *    netted out to 0.
 *  - class: always +10 if it matches PHOTO_HINTS_RE.
 *  - class: always +25 if it matches READABILITY_ASSET.
 *
 * @param {object} node - wrapper exposing `attr(name)`; the specs pass a
 *   cheerio selection.
 * @returns {number} the accumulated weight (0 when neither attribute is set).
 */
export default function getWeight(node) {
  const classes = node.attr('class')
  const id = node.attr('id')
  let score = 0

  if (id) {
    // if id exists, try to score on both positive and negative
    if (POSITIVE_SCORE_RE.test(id)) {
      score += 25
    }
    if (NEGATIVE_SCORE_RE.test(id)) {
      score -= 25
    }
  }

  if (classes) {
    if (score === 0) {
      // if classes exist and the id left the score at 0,
      // try to score on both positive and negative
      if (POSITIVE_SCORE_RE.test(classes)) {
        score += 25
      }
      if (NEGATIVE_SCORE_RE.test(classes)) {
        score -= 25
      }
    }

    // even if score has been set by id, add score for
    // possible photo matches
    // "try to keep photos if we can"
    if (PHOTO_HINTS_RE.test(classes)) {
      score += 10
    }

    // add 25 if class matches entry-content-asset,
    // a class apparently instructed for use in the
    // Readability publisher guidelines
    // https://www.readability.com/developers/guidelines
    if (READABILITY_ASSET.test(classes)) {
      score += 25
    }
  }

  return score
}

@ -0,0 +1 @@
export { default as getWeight } from './getWeight'

@ -0,0 +1,59 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import {
getWeight
} from './index'
// Specs for getWeight: every case loads one fixture from HTML with cheerio,
// scores its <div>, and compares the result with the expected weight.
describe('Generic Extractor Utils', () => {
  describe('getWeight(node)', () => {
    // [spec title, fixture key in HTML, expected score]
    const scenarios = [
      ["returns a score of 25 if node has positive id", 'positiveId', 25],
      ["returns a score of -25 if node has negative id", 'negativeId', -25],
      ["returns a score of 25 if node has positive class", 'positiveClass', 25],
      ["returns a score of -25 if node has negative class", 'negativeClass', -25],
      ["returns a score of 25 if node has both positive id and class", 'positiveIdAndClass', 25],
      // is this really wanted? id="article" class="adbox"
      // should get positive score?
      ["returns a score of 25 if node has pos id and neg class", 'positiveIdNegClass', 25],
      ["returns a score of 10 if node has pos img class", 'positivePhotoClass', 10],
      ["returns a score of 35 if node has pos id pos img class", 'positiveIdAndPhoto', 35],
      ["adds an add'l 25 (total 50) if node uses entry-content-asset class", 'entryContentAsset', 50],
    ]

    scenarios.forEach(([title, fixtureKey, expectedScore]) => {
      it(title, () => {
        const $ = cheerio.load(HTML[fixtureKey])
        const target = $('div')
        assert.equal(getWeight(target), expectedScore)
      })
    })
  })
})

@ -0,0 +1,18 @@
// Markup whose text content is well under 100 characters.
const TOO_SHORT_MARKUP = `
<div class="foo bar">
<p>This is too short</p>
</div>
`

// Markup whose paragraph holds a 100-character run of filler text.
const LONG_ENOUGH_MARKUP = `
<div class="foo bar">
<p>
Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean m
</p>
</div>
`

// Fixtures for the nodeIsSufficient specs, keyed by scenario name.
const HTML = {
  tooShort: TOO_SHORT_MARKUP,
  longEnough: LONG_ENOUGH_MARKUP,
}
export default HTML

@ -1,7 +1,4 @@
// Decide whether a node is article-like enough to return: its text, with
// surrounding whitespace trimmed, must be at least 100 characters long.
export function nodeIsSufficient(node) {
  const trimmedText = node.text().trim()
  const charCount = trimmedText.length
  return charCount >= 100
}
// def node_is_sufficient(self, node):
// return (isinstance(node, lxml.html.HtmlElement) and
// len(inner_text(node)) >= 100)
//

@ -1,9 +1,23 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import {
nodeIsSufficient
} from './index'
// Specs for nodeIsSufficient: each case loads a fixture with cheerio and
// passes the document root to the function under test.
describe('Utils', () => {
describe('nodeIsSufficient(node)', () => {
it("should return true if text length > 100 chars", () => {
assert.equal(true, true)
// The tooShort fixture only contains a short sentence, so it is expected
// to be rejected.
it("returns false if node text length < 100 chars", () => {
const $ = cheerio.load(HTML.tooShort)
const sufficient = nodeIsSufficient($.root())
assert.equal(sufficient, false)
})
// The longEnough fixture's paragraph text appears to be exactly 100
// characters, which is accepted because the check is `>= 100`.
it("returns true if node text length > 100 chars", () => {
const $ = cheerio.load(HTML.longEnough)
const sufficient = nodeIsSufficient($.root())
assert.equal(sufficient, true)
})
})
})

Loading…
Cancel
Save