From 47ac7e9803f3d9bb5b039db47d054d639c5a27f4 Mon Sep 17 00:00:00 2001 From: Adam Pash Date: Fri, 9 Sep 2016 15:29:07 -0400 Subject: [PATCH] refactor: limiting calls to $ function Squashed commit of the following: commit c72da261cb5319d1eef207bff63b3c9cd49018df Author: Adam Pash Date: Fri Sep 9 15:28:43 2016 -0400 refactor: limiting calls to $ function commit eeae88247d844d5c6acbc529dbc3ce4d14e04191 Author: Adam Pash Date: Fri Sep 9 15:14:33 2016 -0400 refactor: convertNodeTo; requires a cheerio object --- TODO.md | 2 +- .../generic/content/extractor.test.js | 6 ++- .../content/utils/scoring/add-score.js | 1 - .../content/utils/scoring/add-score.test.js | 12 ++--- .../utils/scoring/add-to-parent.test.js | 8 ++-- .../utils/scoring/find-top-candidate.test.js | 20 ++++---- .../utils/scoring/get-or-init-score.js | 10 ++-- .../utils/scoring/get-or-init-score.test.js | 2 +- .../content/utils/scoring/get-score.js | 4 +- .../content/utils/scoring/get-score.test.js | 12 ++--- .../content/utils/scoring/score-content.js | 32 ++++++------- .../utils/scoring/score-content.test.js | 9 ++-- .../content/utils/scoring/score-node.js | 6 +-- .../content/utils/scoring/set-score.js | 6 +-- .../content/utils/scoring/set-score.test.js | 6 +-- src/extractors/root-extractor.js | 4 +- src/utils/dom/clean-h-ones.js | 10 ++-- src/utils/dom/clean-headers.js | 13 +++--- src/utils/dom/clean-images.js | 26 +++++------ src/utils/dom/clean-tags.js | 46 ++++++++++--------- src/utils/dom/convert-node-to.js | 4 +- src/utils/dom/convert-to-paragraphs.js | 10 ++-- src/utils/dom/extract-from-selectors.js | 10 ++-- src/utils/dom/link-density.js | 6 +-- src/utils/dom/node-is-sufficient.js | 4 +- src/utils/dom/paragraphize.js | 6 ++- src/utils/dom/strip-unlikely-candidates.js | 9 ++-- src/utils/dom/within-comment.js | 6 +-- 28 files changed, 148 insertions(+), 142 deletions(-) diff --git a/TODO.md b/TODO.md index de72c031..638d723e 100644 --- a/TODO.md +++ b/TODO.md @@ -3,12 +3,12 @@ TODO: - Rename all cleaners from cleanThing to clean - Make sure weightNodes flag is being passed properly - Get better sense of when cheerio returns a raw node and when a cheerio object - - Remove $ from function calls to getScore - Remove $ whenever possible - Test if .is method is faster than regex methods - Separate constants into activity-specific folders (dom, scoring) DONE: +x Remove $ from function calls to getScore x remove all but attributes whitelist. research what attributes are important beyond SRC and href x remove logic for fetching meta attrs with custom props x cleaning embed and object nodes diff --git a/src/extractors/generic/content/extractor.test.js b/src/extractors/generic/content/extractor.test.js index 2fda4a91..9720f488 100644 --- a/src/extractors/generic/content/extractor.test.js +++ b/src/extractors/generic/content/extractor.test.js @@ -10,11 +10,13 @@ describe('GenericContentExtractor', function() { this.timeout(1000000) describe('extract($, html, opts)', () => { it("extracts html and returns the article", () => { - const html = fs.readFileSync('./fixtures/latimes.html', 'utf-8') + const html = fs.readFileSync('./fixtures/wired.html', 'utf-8') // Array.from(range(1, 100)).map((i) => { // console.log(i) - // clean(GenericContentExtractor.extract(null, html)) + // clean(GenericContentExtractor.extract( + // { $: null, html, url: 'http://example.com' } + // )) // }) const result = clean(GenericContentExtractor.extract( { $: null, html, url: 'http://example.com' } diff --git a/src/extractors/generic/content/utils/scoring/add-score.js b/src/extractors/generic/content/utils/scoring/add-score.js index 4f02e59e..59a8ac41 100644 --- a/src/extractors/generic/content/utils/scoring/add-score.js +++ b/src/extractors/generic/content/utils/scoring/add-score.js @@ -1,5 +1,4 @@ import { - getScore, getOrInitScore, setScore, } from './index' diff --git a/src/extractors/generic/content/utils/scoring/add-score.test.js b/src/extractors/generic/content/utils/scoring/add-score.test.js index a3c20928..e04e97e1 100644 --- a/src/extractors/generic/content/utils/scoring/add-score.test.js +++ b/src/extractors/generic/content/utils/scoring/add-score.test.js @@ -10,18 +10,18 @@ describe('Scoring utils', () => { describe('addScore(node, $, amount)', () => { it(`adds the specified amount to a node's score`, () => { const $ = cheerio.load('

Foo

') - let node = $('p').first() + let $node = $('p').first() - node = addScore(node, $, 25) - assert.equal(getScore(node, $), 50) + $node = addScore($node, $, 25) + assert.equal(getScore($node), 50) }) it(`adds score if score not yet set (assumes score is 0)`, () => { const $ = cheerio.load('

Foo

') - let node = $('p').first() + let $node = $('p').first() - node = addScore(node, $, 25) - assert.equal(getScore(node, $), 25) + $node = addScore($node, $, 25) + assert.equal(getScore($node), 25) }) }) diff --git a/src/extractors/generic/content/utils/scoring/add-to-parent.test.js b/src/extractors/generic/content/utils/scoring/add-to-parent.test.js index 1bf0d0b9..610643bb 100644 --- a/src/extractors/generic/content/utils/scoring/add-to-parent.test.js +++ b/src/extractors/generic/content/utils/scoring/add-to-parent.test.js @@ -11,12 +11,12 @@ describe('Scoring utils', () => { it(`adds 1/4 of a node's score it its parent`, () => { const html = '

Foo

' const $ = cheerio.load(html) - let node = $('p').first() + let $node = $('p').first() - node = addToParent(node, $, 40) + $node = addToParent($node, $, 40) - assert.equal(getScore(node.parent(), $), 35) - assert.equal(getScore(node, $), 40) + assert.equal(getScore($node.parent()), 35) + assert.equal(getScore($node), 40) }) }) diff --git a/src/extractors/generic/content/utils/scoring/find-top-candidate.test.js b/src/extractors/generic/content/utils/scoring/find-top-candidate.test.js index 03f34d15..1903678c 100644 --- a/src/extractors/generic/content/utils/scoring/find-top-candidate.test.js +++ b/src/extractors/generic/content/utils/scoring/find-top-candidate.test.js @@ -14,35 +14,35 @@ describe('findTopCandidate($)', () => { it("finds the top candidate from simple case", () => { const $ = cheerio.load(HTML.findDom1) - const topCandidate = findTopCandidate($) + const $$topCandidate = findTopCandidate($) - assert.equal(getScore(topCandidate), 100) + assert.equal(getScore($$topCandidate), 100) }) it("finds the top candidate from a nested case", () => { const $ = cheerio.load(HTML.findDom2) - const topCandidate = findTopCandidate($) + const $$topCandidate = findTopCandidate($) // this is wrapped in a div so checking // the score of the first child - assert.equal(getScore(topCandidate.children().first()), 50) + assert.equal(getScore($$topCandidate.children().first()), 50) }) it("ignores tags like BR", () => { const $ = cheerio.load(HTML.findDom3) - const topCandidate = findTopCandidate($) + const $topCandidate = findTopCandidate($) - assert.equal(getScore(topCandidate), 50) + assert.equal(getScore($topCandidate), 50) }) it("returns BODY if no candidates found", () => { const $ = cheerio.load(HTML.topBody) - const topCandidate = findTopCandidate($) + const $topCandidate = findTopCandidate($) - assert.equal(topCandidate.get(0).tagName, 'body') + assert.equal($topCandidate.get(0).tagName, 'body') }) it("appends a sibling with a good enough score", () => { @@ -51,8 +51,8 @@ describe('findTopCandidate($)', () => { let $ = cheerio.load(html) $ = scoreContent($) - const topCandidate = findTopCandidate($) - assert.equal($(topCandidate).text().length, 3652) + const $topCandidate = findTopCandidate($) + assert.equal($($topCandidate).text().length, 3652) }) }) diff --git a/src/extractors/generic/content/utils/scoring/get-or-init-score.js b/src/extractors/generic/content/utils/scoring/get-or-init-score.js index a85478d4..8a63f55b 100644 --- a/src/extractors/generic/content/utils/scoring/get-or-init-score.js +++ b/src/extractors/generic/content/utils/scoring/get-or-init-score.js @@ -8,19 +8,19 @@ import { // gets and returns the score if it exists // if not, initializes a score based on // the node's tag type -export default function getOrInitScore(node, $, weightNodes=true) { - let score = getScore(node, $) +export default function getOrInitScore($node, $, weightNodes=true) { + let score = getScore($node) if (score) { return score } else { - score = scoreNode(node) + score = scoreNode($node) if (weightNodes) { - score = score + getWeight(node) + score = score + getWeight($node) } - addToParent(node, $, score) + addToParent($node, $, score) } return score diff --git a/src/extractors/generic/content/utils/scoring/get-or-init-score.test.js b/src/extractors/generic/content/utils/scoring/get-or-init-score.test.js index f847f1d8..f2545cef 100644 --- a/src/extractors/generic/content/utils/scoring/get-or-init-score.test.js +++ b/src/extractors/generic/content/utils/scoring/get-or-init-score.test.js @@ -55,7 +55,7 @@ describe('getOrInitScore(node, $)', () => { const score = getOrInitScore(node, $) - assert.equal(getScore(node.parent(), $), 16) + assert.equal(getScore(node.parent()), 16) }) }) }) diff --git a/src/extractors/generic/content/utils/scoring/get-score.js b/src/extractors/generic/content/utils/scoring/get-score.js index f0d28c63..933a91a0 100644 --- a/src/extractors/generic/content/utils/scoring/get-score.js +++ b/src/extractors/generic/content/utils/scoring/get-score.js @@ -1,6 +1,6 @@ // returns the score of a node based on // the node's score attribute // returns null if no score set -export default function getScore(node, $) { - return parseFloat(node.attr('score')) || null +export default function getScore($node) { + return parseFloat($node.attr('score')) || null } diff --git a/src/extractors/generic/content/utils/scoring/get-score.test.js b/src/extractors/generic/content/utils/scoring/get-score.test.js index 21fddc9f..33774c4b 100644 --- a/src/extractors/generic/content/utils/scoring/get-score.test.js +++ b/src/extractors/generic/content/utils/scoring/get-score.test.js @@ -4,18 +4,18 @@ import cheerio from 'cheerio' import { getScore } from './index' describe('Scoring utils', () => { - describe('getScore(node, $)', () => { + describe('getScore($node)', () => { it("returns null if the node has no score set", () => { const $ = cheerio.load('

Foo

') - const node = $('p').first() - assert.equal(getScore(node, $), null) + const $node = $('p').first() + assert.equal(getScore($node), null) }) it("returns 25 if the node has a score attr of 25", () => { const $ = cheerio.load('

Foo

') - const node = $('p').first() - assert.equal(typeof getScore(node, $), 'number') - assert.equal(getScore(node, $), 25) + const $node = $('p').first() + assert.equal(typeof getScore($node), 'number') + assert.equal(getScore($node), 25) }) }) diff --git a/src/extractors/generic/content/utils/scoring/score-content.js b/src/extractors/generic/content/utils/scoring/score-content.js index 03d931c1..b819656c 100644 --- a/src/extractors/generic/content/utils/scoring/score-content.js +++ b/src/extractors/generic/content/utils/scoring/score-content.js @@ -3,7 +3,6 @@ import { HNEWS_CONTENT_SELECTORS } from '../constants' import { scoreNode, setScore, - getScore, getOrInitScore, addScore, } from './index' @@ -17,7 +16,7 @@ export default function scoreContent($, weightNodes=true) { // First, look for special hNews based selectors and give them a big // boost, if they exist HNEWS_CONTENT_SELECTORS.map(([parentSelector, childSelector]) => { - $(parentSelector).find(childSelector).each((index, node) => { + $(`${parentSelector} ${childSelector}`).each((index, node) => { addScore($(node).parent(parentSelector), $, 80) }) }) @@ -25,37 +24,38 @@ export default function scoreContent($, weightNodes=true) { $('p, pre').each((index, node) => { // The raw score for this paragraph, before we add any parent/child // scores. - const rawScore = scoreNode($(node)) - node = setScore(node, $, getOrInitScore($(node), $, weightNodes)) + let $node = $(node) + const rawScore = scoreNode($node) + $node = setScore($node, $, getOrInitScore($node, $, weightNodes)) // Add the individual content score to the parent node - const parent = $(node).parent() - addScoreTo(parent, $, rawScore, weightNodes) - if (parent) { + const $parent = $node.parent() + addScoreTo($parent, $, rawScore, weightNodes) + if ($parent) { // Add half of the individual content score to the // grandparent - addScoreTo(parent.parent(), $, rawScore/2, weightNodes) + addScoreTo($parent.parent(), $, rawScore/2, weightNodes) } }) return $ } -function convertSpans(node, $) { - if (node.get(0)) { - const { tagName } = node.get(0) +function convertSpans($node, $) { + if ($node.get(0)) { + const { tagName } = $node.get(0) if (tagName === 'span') { // convert spans to divs - convertNodeTo(node, $, 'div') + convertNodeTo($node, $, 'div') } } } -function addScoreTo(node, $, score, weightNodes) { - if (node) { - convertSpans(node, $) - addScore(node, $, score) +function addScoreTo($node, $, score, weightNodes) { + if ($node) { + convertSpans($node, $) + addScore($node, $, score) } } diff --git a/src/extractors/generic/content/utils/scoring/score-content.test.js b/src/extractors/generic/content/utils/scoring/score-content.test.js index 2c9f421c..01625bfa 100644 --- a/src/extractors/generic/content/utils/scoring/score-content.test.js +++ b/src/extractors/generic/content/utils/scoring/score-content.test.js @@ -18,16 +18,14 @@ describe('scoreContent($, weightNodes)', () => { const $ = cheerio.load(HTML.hNews.before) const result = scoreContent($).html() - assert.equal(getScore($('div').first(), $), 140) - // assert.equal(getScore($('div').first(), $), 99) + assert.equal(getScore($('div').first()), 140) }) it("is so-so about non-hNews content", () => { const $ = cheerio.load(HTML.nonHNews.before) const result = scoreContent($).html() - // assert.equal(getScore($('div').first(), $), 38) - assert.equal(getScore($('div').first(), $), 65) + assert.equal(getScore($('div').first()), 65) }) it("scores this Wired article the same", () => { @@ -35,8 +33,7 @@ describe('scoreContent($, weightNodes)', () => { const $ = cheerio.load(html) const result = scoreContent($).html() - // assert.equal(getScore($('article').first(), $), 63.75) - assert.equal(getScore($('article').first(), $), 65.5) + assert.equal(getScore($('article').first()), 65.5) }) }) diff --git a/src/extractors/generic/content/utils/scoring/score-node.js b/src/extractors/generic/content/utils/scoring/score-node.js index fcaede2d..7ddd92d5 100644 --- a/src/extractors/generic/content/utils/scoring/score-node.js +++ b/src/extractors/generic/content/utils/scoring/score-node.js @@ -7,14 +7,14 @@ import { // Score an individual node. Has some smarts for paragraphs, otherwise // just scores based on tag. -export default function scoreNode(node) { - const { tagName } = node.get(0) +export default function scoreNode($node) { + const { tagName } = $node.get(0) // TODO: Consider ordering by most likely. // E.g., if divs are a more common tag on a page, // Could save doing that regex test on every node – AP if (PARAGRAPH_SCORE_TAGS.test(tagName)) { - return scoreParagraph(node) + return scoreParagraph($node) } else if (tagName === 'div') { return 5 } else if (CHILD_CONTENT_TAGS.test(tagName)) { diff --git a/src/extractors/generic/content/utils/scoring/set-score.js b/src/extractors/generic/content/utils/scoring/set-score.js index 916fb449..1b0f74ea 100644 --- a/src/extractors/generic/content/utils/scoring/set-score.js +++ b/src/extractors/generic/content/utils/scoring/set-score.js @@ -1,7 +1,7 @@ -export default function setScore(node, $, score) { - $(node).attr('score', score) - return node +export default function setScore($node, $, score) { + $node.attr('score', score) + return $node } diff --git a/src/extractors/generic/content/utils/scoring/set-score.test.js b/src/extractors/generic/content/utils/scoring/set-score.test.js index d9ab7de1..f4701820 100644 --- a/src/extractors/generic/content/utils/scoring/set-score.test.js +++ b/src/extractors/generic/content/utils/scoring/set-score.test.js @@ -11,12 +11,12 @@ describe('Scoring utils', () => { describe('setScore(node, $, amount)', () => { it("sets the specified amount as the node's score", () => { const $ = cheerio.load('

Foo

') - let node = $('p').first() + let $node = $('p').first() const newScore = 25 - node = setScore(node, $, newScore) + $node = setScore($node, $, newScore) - const score = getScore(node, $) + const score = getScore($node) assert(score, newScore) }) }) diff --git a/src/extractors/root-extractor.js b/src/extractors/root-extractor.js index ba561346..3d77da7f 100644 --- a/src/extractors/root-extractor.js +++ b/src/extractors/root-extractor.js @@ -117,7 +117,7 @@ export function transformElements($content, $, { transforms }) { // If value is a string, convert directly if (typeof value === 'string') { $matches.each((index, node) => { - convertNodeTo(node, $, transforms[key]) + convertNodeTo($(node), $, transforms[key]) }) } else if (typeof value === 'function') { // If value is function, apply function to node @@ -125,7 +125,7 @@ export function transformElements($content, $, { transforms }) { const result = value($(node), $) // If function returns a string, convert node to that value if (typeof result === 'string') { - convertNodeTo(node, $, result) + convertNodeTo($(node), $, result) } }) } diff --git a/src/utils/dom/clean-h-ones.js b/src/utils/dom/clean-h-ones.js index 18e10e26..b6702832 100644 --- a/src/utils/dom/clean-h-ones.js +++ b/src/utils/dom/clean-h-ones.js @@ -5,12 +5,12 @@ import { convertNodeTo } from 'utils/dom' // strip them. Otherwise, turn 'em into H2s. export default function cleanHOnes(article, $) { // const hOnes = $.find('h1') - const hOnes = $('h1', article) - if (hOnes.length < 3) { - hOnes.each((index, node) => $(node).remove()) + const $hOnes = $('h1', article) + if ($hOnes.length < 3) { + $hOnes.each((index, node) => $(node).remove()) } else { - hOnes.each((index, node) => { - convertNodeTo(node, $, 'h2') + $hOnes.each((index, node) => { + convertNodeTo($(node), $, 'h2') }) } diff --git a/src/utils/dom/clean-headers.js b/src/utils/dom/clean-headers.js index ea44f1a6..c5e1ba5b 100644 --- a/src/utils/dom/clean-headers.js +++ b/src/utils/dom/clean-headers.js @@ -2,25 +2,26 @@ import { HEADER_TAG_LIST } from './constants' import { normalizeSpaces } from '../text' import { getWeight } from 'extractors/generic/content/utils/scoring' -export default function cleanHeaders(article, $, title='') { - $(HEADER_TAG_LIST, article).each((index, header) => { +export default function cleanHeaders($article, $, title='') { + $(HEADER_TAG_LIST, $article).each((index, header) => { + const $header = $(header) // Remove any headers that appear before all other p tags in the // document. This probably means that it was part of the title, a // subtitle or something else extraneous like a datestamp or byline, // all of which should be handled by other metadata handling. - if ($(header, article).prevAll('p').length === 0) { - return $(header).remove() + if ($($header, $article).prevAll('p').length === 0) { + return $header.remove() } // Remove any headers that match the title exactly. if (normalizeSpaces($(header).text()) === title) { - return $(header).remove() + return $header.remove() } // If this header has a negative weight, it's probably junk. // Get rid of it. if (getWeight($(header)) < 0) { - return $(header).remove() + return $header.remove() } }) return $ diff --git a/src/utils/dom/clean-images.js b/src/utils/dom/clean-images.js index 13bc553a..2a089f32 100644 --- a/src/utils/dom/clean-images.js +++ b/src/utils/dom/clean-images.js @@ -1,30 +1,30 @@ import { SPACER_RE } from './constants' -export default function cleanImages(article, $) { - $(article).find('img').each((index, img) => { - img = $(img) +export default function cleanImages($article, $) { + $article.find('img').each((index, img) => { + const $img = $(img) - cleanForHeight(img, $) - removeSpacers(img, $) + cleanForHeight($img, $) + removeSpacers($img, $) }) return $ } -function cleanForHeight(img, $) { - const height = parseInt(img.attr('height')) - const width = parseInt(img.attr('width')) || 20 +function cleanForHeight($img, $) { + const height = parseInt($img.attr('height')) + const width = parseInt($img.attr('width')) || 20 // Remove images that explicitly have very small heights or // widths, because they are most likely shims or icons, // which aren't very useful for reading. if ((height || 20) < 10 || width < 10) { - $(img).remove() + $img.remove() } else if (height) { // Don't ever specify a height on images, so that we can // scale with respect to width without screwing up the // aspect ratio. - img.removeAttr('height') + $img.removeAttr('height') } return $ @@ -32,9 +32,9 @@ function cleanForHeight(img, $) { // Cleans out images where the source string matches transparent/spacer/etc // TODO This seems very aggressive - AP -function removeSpacers(img, $) { - if (SPACER_RE.test(img.attr('src'))) { - $(img).remove() +function removeSpacers($img, $) { + if (SPACER_RE.test($img.attr('src'))) { + $img.remove() } return $ diff --git a/src/utils/dom/clean-tags.js b/src/utils/dom/clean-tags.js index 5bb8c984..20d8f1b0 100644 --- a/src/utils/dom/clean-tags.js +++ b/src/utils/dom/clean-tags.js @@ -17,62 +17,63 @@ import { linkDensity } from './index' // etc) // // Return this same doc. -export default function cleanTags(article, $) { - $(CLEAN_CONDITIONALLY_TAGS, article).each((index, node) => { - let weight = getScore($(node)) +export default function cleanTags($article, $) { + $(CLEAN_CONDITIONALLY_TAGS, $article).each((index, node) => { + const $node = $(node) + let weight = getScore($node) if (!weight) { - weight = getOrInitScore($(node), $) - setScore(weight, $) + weight = getOrInitScore($node, $) + setScore($node, $, weight) } // drop node if its weight is < 0 if (weight < 0) { - $(node).remove() + $node.remove() } else { // deteremine if node seems like content - removeUnlessContent(node, $, weight) + removeUnlessContent($node, $, weight) } }) return $ } -function removeUnlessContent(node, $, weight) { +function removeUnlessContent($node, $, weight) { // Explicitly save entry-content-asset tags, which are // noted as valuable in the Publisher guidelines. For now // this works everywhere. We may want to consider making // this less of a sure-thing later. - if ($(node).hasClass('entry-content-asset')) { + if ($node.hasClass('entry-content-asset')) { return } - const content = normalizeSpaces($(node).text()) + const content = normalizeSpaces($node.text()) if (scoreCommas(content) < 10) { - const pCount = $('p', node).length - const inputCount = $('input', node).length + const pCount = $('p', $node).length + const inputCount = $('input', $node).length // Looks like a form, too many inputs. if (inputCount > (pCount / 3)) { - return $(node).remove() + return $node.remove() } const contentLength = content.length - const imgCount = $('img', node).length + const imgCount = $('img', $node).length // Content is too short, and there are no images, so // this is probably junk content. if (contentLength < 25 && imgCount === 0) { - return $(node).remove() + return $node.remove() } - const density = linkDensity($(node)) + const density = linkDensity($node) // Too high of link density, is probably a menu or // something similar. // console.log(weight, density, contentLength) if (weight < 25 && density > 0.2 && contentLength > 75) { - return $(node).remove() + return $node.remove() } // Too high of a link density, despite the score being @@ -81,22 +82,23 @@ function removeUnlessContent(node, $, weight) { // Don't remove the node if it's a list and the // previous sibling starts with a colon though. That // means it's probably content. - const nodeIsList = node.tagName === 'ol' || node.tagName === 'ul' + const tagName = $node.get(0).tagName + const nodeIsList = tagName === 'ol' || tagName === 'ul' if (nodeIsList) { - const previousNode = $(node).prev() + const previousNode = $node.prev() if (previousNode && normalizeSpaces(previousNode.text()).slice(-1) === ':') { return } } - return $(node).remove() + return $node.remove() } - const scriptCount = $('script', node).length + const scriptCount = $('script', $node).length // Too many script tags, not enough content. if (scriptCount > 0 && contentLength < 150) { - return $(node).remove() + return $node.remove() } } } diff --git a/src/utils/dom/convert-node-to.js b/src/utils/dom/convert-node-to.js index e0fd0d65..a654e604 100644 --- a/src/utils/dom/convert-node-to.js +++ b/src/utils/dom/convert-node-to.js @@ -1,4 +1,4 @@ -export default function convertNodeTo(node, $, tag='p') { - $(node).replaceWith(`<${tag}>${$(node).contents()}`) +export default function convertNodeTo($node, $, tag='p') { + $node.replaceWith(`<${tag}>${$node.contents()}`) return $ } diff --git a/src/utils/dom/convert-to-paragraphs.js b/src/utils/dom/convert-to-paragraphs.js index 75276868..b7302755 100644 --- a/src/utils/dom/convert-to-paragraphs.js +++ b/src/utils/dom/convert-to-paragraphs.js @@ -24,10 +24,11 @@ export default function convertToParagraphs($) { function convertDivs($) { $('div').each((index, div) => { - const convertable = $(div).children() + const $div = $(div) + const convertable = $div.children() .not(DIV_TO_P_BLOCK_TAGS).length == 0 if (convertable) { - convertNodeTo(div, $, 'p') + convertNodeTo($div, $, 'p') } }) @@ -36,9 +37,10 @@ function convertDivs($) { function convertSpans($) { $('span').each((index, span) => { - const convertable = $(span).parents('p, div').length == 0 + const $span = $(span) + const convertable = $span.parents('p, div').length == 0 if (convertable) { - convertNodeTo(span, $, 'p') + convertNodeTo($span, $, 'p') } }) diff --git a/src/utils/dom/extract-from-selectors.js b/src/utils/dom/extract-from-selectors.js index 37cbcb36..ebe22ee6 100644 --- a/src/utils/dom/extract-from-selectors.js +++ b/src/utils/dom/extract-from-selectors.js @@ -14,23 +14,23 @@ export default function extractFromSelectors( // If we didn't get exactly one of this selector, this may be // a list of articles or comments. Skip it. if (nodes.length === 1) { - const node = nodes[0] + const $node = $(nodes[0]) // If it has a number of children, it's more likely a container // element. Skip it. - if ($(node).children().length > maxChildren) { + if ($node.children().length > maxChildren) { continue } // If it looks to be within a comment, skip it. - if (withinComment(node, $)) { + if (withinComment($node, $)) { continue } let content if (textOnly) { - content = $(node).text() + content = $node.text() } else { - content = $(node).html() + content = $node.html() } if (content) { diff --git a/src/utils/dom/link-density.js b/src/utils/dom/link-density.js index b05baa9a..2fd87ecc 100644 --- a/src/utils/dom/link-density.js +++ b/src/utils/dom/link-density.js @@ -1,10 +1,10 @@ // Determines what percentage of the text // in a node is link text // Takes a node, returns a float -export function linkDensity(node) { - const totalTextLength = textLength(node.text()) +export function linkDensity($node) { + const totalTextLength = textLength($node.text()) - const linkText = node.find('a').text() + const linkText = $node.find('a').text() const linkLength = textLength(linkText) if (totalTextLength > 0) { diff --git a/src/utils/dom/node-is-sufficient.js b/src/utils/dom/node-is-sufficient.js index 24a36248..517918f1 100644 --- a/src/utils/dom/node-is-sufficient.js +++ b/src/utils/dom/node-is-sufficient.js @@ -2,6 +2,6 @@ // param: node (a cheerio node) // return: boolean -export default function nodeIsSufficient(node) { - return node.text().trim().length >= 100 +export default function nodeIsSufficient($node) { + return $node.text().trim().length >= 100 } diff --git a/src/utils/dom/paragraphize.js b/src/utils/dom/paragraphize.js index c805fb7f..10306005 100644 --- a/src/utils/dom/paragraphize.js +++ b/src/utils/dom/paragraphize.js @@ -12,6 +12,8 @@ import { BLOCK_LEVEL_TAGS_RE } from './constants' // :param br: Whether or not the passed node is a br export default function paragraphize(node, $, br=false) { + const $node = $(node) + if (br) { let sibling = node.nextSibling let p = $('

') @@ -28,8 +30,8 @@ export default function paragraphize(node, $, br=false) { sibling = nextSibling } - $(node).replaceWith(p) - $(node).remove() + $node.replaceWith(p) + $node.remove() return $ } else { // Not currently implemented. May not need to; can leverage diff --git a/src/utils/dom/strip-unlikely-candidates.js b/src/utils/dom/strip-unlikely-candidates.js index e982aef7..1c47eae7 100644 --- a/src/utils/dom/strip-unlikely-candidates.js +++ b/src/utils/dom/strip-unlikely-candidates.js @@ -18,9 +18,10 @@ export default function stripUnlikelyCandidates($) { // // :param $: a cheerio object to strip nodes from // :return $: the cleaned cheerio object - $('*').not('a').each(function(index, element) { - const classes = $(element).attr('class') - const id = $(element).attr('id') + $('*').not('a').each(function(index, node) { + const $node = $(node) + const classes = $node.attr('class') + const id = $node.attr('id') if (!id && !classes) { return } else { @@ -28,7 +29,7 @@ export default function stripUnlikelyCandidates($) { if (CANDIDATES_WHITELIST.test(classAndId)) { return } else if (CANDIDATES_BLACKLIST.test(classAndId)) { - return $(element).remove() + return $node.remove() } } }) diff --git a/src/utils/dom/within-comment.js b/src/utils/dom/within-comment.js index 18ecfaf0..496f9e73 100644 --- a/src/utils/dom/within-comment.js +++ b/src/utils/dom/within-comment.js @@ -1,7 +1,7 @@ -export default function withinComment(node, $) { - const parents = $(node).parents().toArray() +export default function withinComment($node, $) { + const parents = $node.parents().toArray() const commentParent = parents.find((parent) => { - const classAndId = `${$(parent).attr('class')} ${$(parent).attr('id')}` + const classAndId = `${parent.attribs['class']} ${parent.attribs['id']}` return classAndId.includes('comment') })