You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
mercury-parser/src/extractors/generic/content/scoring/score-paragraph.js

36 lines
944 B
JavaScript

import {
scoreCommas,
scoreLength,
} from './index'
// Compute a heuristic quality score for one paragraph node; a larger
// number means the paragraph looks more like real article content.
//
// `node` must expose a `text()` method whose result can be trimmed.
// Returns 0 for paragraphs under 25 characters (after trimming);
// otherwise a base of 1 plus the comma and length scores, minus 1 when
// the paragraph ends with a colon.
export default function scoreParagraph(node) {
  const content = node.text().trim()
  const charCount = content.length

  // Anything shorter than 25 characters is ignored outright.
  if (charCount < 25) return 0

  // Base point, plus whatever scoreCommas awards for the text, plus
  // whatever scoreLength awards for the character count (the original
  // notes describe that as one point per 50 characters, capped at 3).
  const subtotal = 1 + scoreCommas(content) + scoreLength(charCount)

  // A closing paragraph may just be someone being clever, but a
  // paragraph ending in ':' is often the lead-in to a list of junk that
  // gets stripped later. Docking one point nudges those lead-ins just
  // under the cutoff threshold.
  return content.endsWith(':') ? subtotal - 1 : subtotal
}