refactor: limiting calls to $ function

Squashed commit of the following:

commit c72da261cb5319d1eef207bff63b3c9cd49018df
Author: Adam Pash <adam.pash@gmail.com>
Date:   Fri Sep 9 15:28:43 2016 -0400

    refactor: limiting calls to $ function

commit eeae88247d844d5c6acbc529dbc3ce4d14e04191
Author: Adam Pash <adam.pash@gmail.com>
Date:   Fri Sep 9 15:14:33 2016 -0400

    refactor: convertNodeTo; requires a cheerio object
pull/1/head
Adam Pash 8 years ago
parent 81e9e7a317
commit 47ac7e9803

@ -3,12 +3,12 @@ TODO:
- Rename all cleaners from cleanThing to clean
- Make sure weightNodes flag is being passed properly
- Get a better sense of when cheerio returns a raw node and when it returns a cheerio object
- Remove $ from function calls to getScore
- Remove $ whenever possible
- Test if .is method is faster than regex methods
- Separate constants into activity-specific folders (dom, scoring)
DONE:
x Remove $ from function calls to getScore
x remove all but attributes whitelist. research what attributes are important beyond SRC and href
x remove logic for fetching meta attrs with custom props
x cleaning embed and object nodes

@ -10,11 +10,13 @@ describe('GenericContentExtractor', function() {
this.timeout(1000000)
describe('extract($, html, opts)', () => {
it("extracts html and returns the article", () => {
const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8')
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8')
// Array.from(range(1, 100)).map((i) => {
// console.log(i)
// clean(GenericContentExtractor.extract(null, html))
// clean(GenericContentExtractor.extract(
// { $: null, html, url: 'http://example.com' }
// ))
// })
const result = clean(GenericContentExtractor.extract(
{ $: null, html, url: 'http://example.com' }

@ -1,5 +1,4 @@
import {
getScore,
getOrInitScore,
setScore,
} from './index'

@ -10,18 +10,18 @@ describe('Scoring utils', () => {
describe('addScore(node, $, amount)', () => {
it(`adds the specified amount to a node's score`, () => {
const $ = cheerio.load('<p score="25">Foo</p>')
let node = $('p').first()
let $node = $('p').first()
node = addScore(node, $, 25)
assert.equal(getScore(node, $), 50)
$node = addScore($node, $, 25)
assert.equal(getScore($node), 50)
})
it(`adds score if score not yet set (assumes score is 0)`, () => {
const $ = cheerio.load('<p>Foo</p>')
let node = $('p').first()
let $node = $('p').first()
node = addScore(node, $, 25)
assert.equal(getScore(node, $), 25)
$node = addScore($node, $, 25)
assert.equal(getScore($node), 25)
})
})

@ -11,12 +11,12 @@ describe('Scoring utils', () => {
it(`adds 1/4 of a node's score it its parent`, () => {
const html = '<div score="25"><p score="40">Foo</p></div>'
const $ = cheerio.load(html)
let node = $('p').first()
let $node = $('p').first()
node = addToParent(node, $, 40)
$node = addToParent($node, $, 40)
assert.equal(getScore(node.parent(), $), 35)
assert.equal(getScore(node, $), 40)
assert.equal(getScore($node.parent()), 35)
assert.equal(getScore($node), 40)
})
})

@ -14,35 +14,35 @@ describe('findTopCandidate($)', () => {
it("finds the top candidate from simple case", () => {
const $ = cheerio.load(HTML.findDom1)
const topCandidate = findTopCandidate($)
const $$topCandidate = findTopCandidate($)
assert.equal(getScore(topCandidate), 100)
assert.equal(getScore($$topCandidate), 100)
})
it("finds the top candidate from a nested case", () => {
const $ = cheerio.load(HTML.findDom2)
const topCandidate = findTopCandidate($)
const $$topCandidate = findTopCandidate($)
// this is wrapped in a div so checking
// the score of the first child
assert.equal(getScore(topCandidate.children().first()), 50)
assert.equal(getScore($$topCandidate.children().first()), 50)
})
it("ignores tags like BR", () => {
const $ = cheerio.load(HTML.findDom3)
const topCandidate = findTopCandidate($)
const $topCandidate = findTopCandidate($)
assert.equal(getScore(topCandidate), 50)
assert.equal(getScore($topCandidate), 50)
})
it("returns BODY if no candidates found", () => {
const $ = cheerio.load(HTML.topBody)
const topCandidate = findTopCandidate($)
const $topCandidate = findTopCandidate($)
assert.equal(topCandidate.get(0).tagName, 'body')
assert.equal($topCandidate.get(0).tagName, 'body')
})
it("appends a sibling with a good enough score", () => {
@ -51,8 +51,8 @@ describe('findTopCandidate($)', () => {
let $ = cheerio.load(html)
$ = scoreContent($)
const topCandidate = findTopCandidate($)
assert.equal($(topCandidate).text().length, 3652)
const $topCandidate = findTopCandidate($)
assert.equal($($topCandidate).text().length, 3652)
})
})

@ -8,19 +8,19 @@ import {
// gets and returns the score if it exists
// if not, initializes a score based on
// the node's tag type
export default function getOrInitScore(node, $, weightNodes=true) {
let score = getScore(node, $)
export default function getOrInitScore($node, $, weightNodes=true) {
let score = getScore($node)
if (score) {
return score
} else {
score = scoreNode(node)
score = scoreNode($node)
if (weightNodes) {
score = score + getWeight(node)
score = score + getWeight($node)
}
addToParent(node, $, score)
addToParent($node, $, score)
}
return score

@ -55,7 +55,7 @@ describe('getOrInitScore(node, $)', () => {
const score = getOrInitScore(node, $)
assert.equal(getScore(node.parent(), $), 16)
assert.equal(getScore(node.parent()), 16)
})
})
})

@ -1,6 +1,6 @@
// returns the score of a node based on
// the node's score attribute
// returns null if no score set
export default function getScore(node, $) {
return parseFloat(node.attr('score')) || null
export default function getScore($node) {
return parseFloat($node.attr('score')) || null
}

@ -4,18 +4,18 @@ import cheerio from 'cheerio'
import { getScore } from './index'
describe('Scoring utils', () => {
describe('getScore(node, $)', () => {
describe('getScore($node)', () => {
it("returns null if the node has no score set", () => {
const $ = cheerio.load('<p>Foo</p>')
const node = $('p').first()
assert.equal(getScore(node, $), null)
const $node = $('p').first()
assert.equal(getScore($node), null)
})
it("returns 25 if the node has a score attr of 25", () => {
const $ = cheerio.load('<p score="25">Foo</p>')
const node = $('p').first()
assert.equal(typeof getScore(node, $), 'number')
assert.equal(getScore(node, $), 25)
const $node = $('p').first()
assert.equal(typeof getScore($node), 'number')
assert.equal(getScore($node), 25)
})
})

@ -3,7 +3,6 @@ import { HNEWS_CONTENT_SELECTORS } from '../constants'
import {
scoreNode,
setScore,
getScore,
getOrInitScore,
addScore,
} from './index'
@ -17,7 +16,7 @@ export default function scoreContent($, weightNodes=true) {
// First, look for special hNews based selectors and give them a big
// boost, if they exist
HNEWS_CONTENT_SELECTORS.map(([parentSelector, childSelector]) => {
$(parentSelector).find(childSelector).each((index, node) => {
$(`${parentSelector} ${childSelector}`).each((index, node) => {
addScore($(node).parent(parentSelector), $, 80)
})
})
@ -25,37 +24,38 @@ export default function scoreContent($, weightNodes=true) {
$('p, pre').each((index, node) => {
// The raw score for this paragraph, before we add any parent/child
// scores.
const rawScore = scoreNode($(node))
node = setScore(node, $, getOrInitScore($(node), $, weightNodes))
let $node = $(node)
const rawScore = scoreNode($node)
$node = setScore($node, $, getOrInitScore($node, $, weightNodes))
// Add the individual content score to the parent node
const parent = $(node).parent()
addScoreTo(parent, $, rawScore, weightNodes)
if (parent) {
const $parent = $node.parent()
addScoreTo($parent, $, rawScore, weightNodes)
if ($parent) {
// Add half of the individual content score to the
// grandparent
addScoreTo(parent.parent(), $, rawScore/2, weightNodes)
addScoreTo($parent.parent(), $, rawScore/2, weightNodes)
}
})
return $
}
function convertSpans(node, $) {
if (node.get(0)) {
const { tagName } = node.get(0)
function convertSpans($node, $) {
if ($node.get(0)) {
const { tagName } = $node.get(0)
if (tagName === 'span') {
// convert spans to divs
convertNodeTo(node, $, 'div')
convertNodeTo($node, $, 'div')
}
}
}
function addScoreTo(node, $, score, weightNodes) {
if (node) {
convertSpans(node, $)
addScore(node, $, score)
function addScoreTo($node, $, score, weightNodes) {
if ($node) {
convertSpans($node, $)
addScore($node, $, score)
}
}

@ -18,16 +18,14 @@ describe('scoreContent($, weightNodes)', () => {
const $ = cheerio.load(HTML.hNews.before)
const result = scoreContent($).html()
assert.equal(getScore($('div').first(), $), 140)
// assert.equal(getScore($('div').first(), $), 99)
assert.equal(getScore($('div').first()), 140)
})
it("is so-so about non-hNews content", () => {
const $ = cheerio.load(HTML.nonHNews.before)
const result = scoreContent($).html()
// assert.equal(getScore($('div').first(), $), 38)
assert.equal(getScore($('div').first(), $), 65)
assert.equal(getScore($('div').first()), 65)
})
it("scores this Wired article the same", () => {
@ -35,8 +33,7 @@ describe('scoreContent($, weightNodes)', () => {
const $ = cheerio.load(html)
const result = scoreContent($).html()
// assert.equal(getScore($('article').first(), $), 63.75)
assert.equal(getScore($('article').first(), $), 65.5)
assert.equal(getScore($('article').first()), 65.5)
})
})

@ -7,14 +7,14 @@ import {
// Score an individual node. Has some smarts for paragraphs, otherwise
// just scores based on tag.
export default function scoreNode(node) {
const { tagName } = node.get(0)
export default function scoreNode($node) {
const { tagName } = $node.get(0)
// TODO: Consider ordering by most likely.
// E.g., if divs are a more common tag on a page,
// Could save doing that regex test on every node AP
if (PARAGRAPH_SCORE_TAGS.test(tagName)) {
return scoreParagraph(node)
return scoreParagraph($node)
} else if (tagName === 'div') {
return 5
} else if (CHILD_CONTENT_TAGS.test(tagName)) {

@ -1,7 +1,7 @@
export default function setScore(node, $, score) {
$(node).attr('score', score)
return node
export default function setScore($node, $, score) {
$node.attr('score', score)
return $node
}

@ -11,12 +11,12 @@ describe('Scoring utils', () => {
describe('setScore(node, $, amount)', () => {
it("sets the specified amount as the node's score", () => {
const $ = cheerio.load('<p>Foo</p>')
let node = $('p').first()
let $node = $('p').first()
const newScore = 25
node = setScore(node, $, newScore)
$node = setScore($node, $, newScore)
const score = getScore(node, $)
const score = getScore($node)
assert(score, newScore)
})
})

@ -117,7 +117,7 @@ export function transformElements($content, $, { transforms }) {
// If value is a string, convert directly
if (typeof value === 'string') {
$matches.each((index, node) => {
convertNodeTo(node, $, transforms[key])
convertNodeTo($(node), $, transforms[key])
})
} else if (typeof value === 'function') {
// If value is function, apply function to node
@ -125,7 +125,7 @@ export function transformElements($content, $, { transforms }) {
const result = value($(node), $)
// If function returns a string, convert node to that value
if (typeof result === 'string') {
convertNodeTo(node, $, result)
convertNodeTo($(node), $, result)
}
})
}

@ -5,12 +5,12 @@ import { convertNodeTo } from 'utils/dom'
// strip them. Otherwise, turn 'em into H2s.
export default function cleanHOnes(article, $) {
// const hOnes = $.find('h1')
const hOnes = $('h1', article)
if (hOnes.length < 3) {
hOnes.each((index, node) => $(node).remove())
const $hOnes = $('h1', article)
if ($hOnes.length < 3) {
$hOnes.each((index, node) => $(node).remove())
} else {
hOnes.each((index, node) => {
convertNodeTo(node, $, 'h2')
$hOnes.each((index, node) => {
convertNodeTo($(node), $, 'h2')
})
}

@ -2,25 +2,26 @@ import { HEADER_TAG_LIST } from './constants'
import { normalizeSpaces } from '../text'
import { getWeight } from 'extractors/generic/content/utils/scoring'
export default function cleanHeaders(article, $, title='') {
$(HEADER_TAG_LIST, article).each((index, header) => {
export default function cleanHeaders($article, $, title='') {
$(HEADER_TAG_LIST, $article).each((index, header) => {
const $header = $(header)
// Remove any headers that appear before all other p tags in the
// document. This probably means that it was part of the title, a
// subtitle or something else extraneous like a datestamp or byline,
// all of which should be handled by other metadata handling.
if ($(header, article).prevAll('p').length === 0) {
return $(header).remove()
if ($($header, $article).prevAll('p').length === 0) {
return $header.remove()
}
// Remove any headers that match the title exactly.
if (normalizeSpaces($(header).text()) === title) {
return $(header).remove()
return $header.remove()
}
// If this header has a negative weight, it's probably junk.
// Get rid of it.
if (getWeight($(header)) < 0) {
return $(header).remove()
return $header.remove()
}
})
return $

@ -1,30 +1,30 @@
import { SPACER_RE } from './constants'
export default function cleanImages(article, $) {
$(article).find('img').each((index, img) => {
img = $(img)
export default function cleanImages($article, $) {
$article.find('img').each((index, img) => {
const $img = $(img)
cleanForHeight(img, $)
removeSpacers(img, $)
cleanForHeight($img, $)
removeSpacers($img, $)
})
return $
}
function cleanForHeight(img, $) {
const height = parseInt(img.attr('height'))
const width = parseInt(img.attr('width')) || 20
function cleanForHeight($img, $) {
const height = parseInt($img.attr('height'))
const width = parseInt($img.attr('width')) || 20
// Remove images that explicitly have very small heights or
// widths, because they are most likely shims or icons,
// which aren't very useful for reading.
if ((height || 20) < 10 || width < 10) {
$(img).remove()
$img.remove()
} else if (height) {
// Don't ever specify a height on images, so that we can
// scale with respect to width without screwing up the
// aspect ratio.
img.removeAttr('height')
$img.removeAttr('height')
}
return $
@ -32,9 +32,9 @@ function cleanForHeight(img, $) {
// Cleans out images where the source string matches transparent/spacer/etc
// TODO This seems very aggressive - AP
function removeSpacers(img, $) {
if (SPACER_RE.test(img.attr('src'))) {
$(img).remove()
function removeSpacers($img, $) {
if (SPACER_RE.test($img.attr('src'))) {
$img.remove()
}
return $

@ -17,62 +17,63 @@ import { linkDensity } from './index'
// etc)
//
// Return this same doc.
export default function cleanTags(article, $) {
$(CLEAN_CONDITIONALLY_TAGS, article).each((index, node) => {
let weight = getScore($(node))
export default function cleanTags($article, $) {
$(CLEAN_CONDITIONALLY_TAGS, $article).each((index, node) => {
const $node = $(node)
let weight = getScore($node)
if (!weight) {
weight = getOrInitScore($(node), $)
setScore(weight, $)
weight = getOrInitScore($node, $)
setScore($node, $, weight)
}
// drop node if its weight is < 0
if (weight < 0) {
$(node).remove()
$node.remove()
} else {
// determine if node seems like content
removeUnlessContent(node, $, weight)
removeUnlessContent($node, $, weight)
}
})
return $
}
function removeUnlessContent(node, $, weight) {
function removeUnlessContent($node, $, weight) {
// Explicitly save entry-content-asset tags, which are
// noted as valuable in the Publisher guidelines. For now
// this works everywhere. We may want to consider making
// this less of a sure-thing later.
if ($(node).hasClass('entry-content-asset')) {
if ($node.hasClass('entry-content-asset')) {
return
}
const content = normalizeSpaces($(node).text())
const content = normalizeSpaces($node.text())
if (scoreCommas(content) < 10) {
const pCount = $('p', node).length
const inputCount = $('input', node).length
const pCount = $('p', $node).length
const inputCount = $('input', $node).length
// Looks like a form, too many inputs.
if (inputCount > (pCount / 3)) {
return $(node).remove()
return $node.remove()
}
const contentLength = content.length
const imgCount = $('img', node).length
const imgCount = $('img', $node).length
// Content is too short, and there are no images, so
// this is probably junk content.
if (contentLength < 25 && imgCount === 0) {
return $(node).remove()
return $node.remove()
}
const density = linkDensity($(node))
const density = linkDensity($node)
// Too high of link density, is probably a menu or
// something similar.
// console.log(weight, density, contentLength)
if (weight < 25 && density > 0.2 && contentLength > 75) {
return $(node).remove()
return $node.remove()
}
// Too high of a link density, despite the score being
@ -81,22 +82,23 @@ function removeUnlessContent(node, $, weight) {
// Don't remove the node if it's a list and the
// previous sibling starts with a colon though. That
// means it's probably content.
const nodeIsList = node.tagName === 'ol' || node.tagName === 'ul'
const tagName = $node.get(0).tagName
const nodeIsList = tagName === 'ol' || tagName === 'ul'
if (nodeIsList) {
const previousNode = $(node).prev()
const previousNode = $node.prev()
if (previousNode && normalizeSpaces(previousNode.text()).slice(-1) === ':') {
return
}
}
return $(node).remove()
return $node.remove()
}
const scriptCount = $('script', node).length
const scriptCount = $('script', $node).length
// Too many script tags, not enough content.
if (scriptCount > 0 && contentLength < 150) {
return $(node).remove()
return $node.remove()
}
}
}

@ -1,4 +1,4 @@
export default function convertNodeTo(node, $, tag='p') {
$(node).replaceWith(`<${tag}>${$(node).contents()}</${tag}>`)
export default function convertNodeTo($node, $, tag='p') {
$node.replaceWith(`<${tag}>${$node.contents()}</${tag}>`)
return $
}

@ -24,10 +24,11 @@ export default function convertToParagraphs($) {
function convertDivs($) {
$('div').each((index, div) => {
const convertable = $(div).children()
const $div = $(div)
const convertable = $div.children()
.not(DIV_TO_P_BLOCK_TAGS).length == 0
if (convertable) {
convertNodeTo(div, $, 'p')
convertNodeTo($div, $, 'p')
}
})
@ -36,9 +37,10 @@ function convertDivs($) {
function convertSpans($) {
$('span').each((index, span) => {
const convertable = $(span).parents('p, div').length == 0
const $span = $(span)
const convertable = $span.parents('p, div').length == 0
if (convertable) {
convertNodeTo(span, $, 'p')
convertNodeTo($span, $, 'p')
}
})

@ -14,23 +14,23 @@ export default function extractFromSelectors(
// If we didn't get exactly one of this selector, this may be
// a list of articles or comments. Skip it.
if (nodes.length === 1) {
const node = nodes[0]
const $node = $(nodes[0])
// If it has a number of children, it's more likely a container
// element. Skip it.
if ($(node).children().length > maxChildren) {
if ($node.children().length > maxChildren) {
continue
}
// If it looks to be within a comment, skip it.
if (withinComment(node, $)) {
if (withinComment($node, $)) {
continue
}
let content
if (textOnly) {
content = $(node).text()
content = $node.text()
} else {
content = $(node).html()
content = $node.html()
}
if (content) {

@ -1,10 +1,10 @@
// Determines what percentage of the text
// in a node is link text
// Takes a node, returns a float
export function linkDensity(node) {
const totalTextLength = textLength(node.text())
export function linkDensity($node) {
const totalTextLength = textLength($node.text())
const linkText = node.find('a').text()
const linkText = $node.find('a').text()
const linkLength = textLength(linkText)
if (totalTextLength > 0) {

@ -2,6 +2,6 @@
// param: node (a cheerio node)
// return: boolean
export default function nodeIsSufficient(node) {
return node.text().trim().length >= 100
export default function nodeIsSufficient($node) {
return $node.text().trim().length >= 100
}

@ -12,6 +12,8 @@ import { BLOCK_LEVEL_TAGS_RE } from './constants'
// :param br: Whether or not the passed node is a br
export default function paragraphize(node, $, br=false) {
const $node = $(node)
if (br) {
let sibling = node.nextSibling
let p = $('<p></p>')
@ -28,8 +30,8 @@ export default function paragraphize(node, $, br=false) {
sibling = nextSibling
}
$(node).replaceWith(p)
$(node).remove()
$node.replaceWith(p)
$node.remove()
return $
} else {
// Not currently implemented. May not need to; can leverage

@ -18,9 +18,10 @@ export default function stripUnlikelyCandidates($) {
//
// :param $: a cheerio object to strip nodes from
// :return $: the cleaned cheerio object
$('*').not('a').each(function(index, element) {
const classes = $(element).attr('class')
const id = $(element).attr('id')
$('*').not('a').each(function(index, node) {
const $node = $(node)
const classes = $node.attr('class')
const id = $node.attr('id')
if (!id && !classes) {
return
} else {
@ -28,7 +29,7 @@ export default function stripUnlikelyCandidates($) {
if (CANDIDATES_WHITELIST.test(classAndId)) {
return
} else if (CANDIDATES_BLACKLIST.test(classAndId)) {
return $(element).remove()
return $node.remove()
}
}
})

@ -1,7 +1,7 @@
export default function withinComment(node, $) {
const parents = $(node).parents().toArray()
export default function withinComment($node, $) {
const parents = $node.parents().toArray()
const commentParent = parents.find((parent) => {
const classAndId = `${$(parent).attr('class')} ${$(parent).attr('id')}`
const classAndId = `${parent.attribs['class']} ${parent.attribs['id']}`
return classAndId.includes('comment')
})

Loading…
Cancel
Save