You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
84 lines
2.5 KiB
JavaScript
84 lines
2.5 KiB
JavaScript
import {
|
|
textLength,
|
|
linkDensity,
|
|
} from 'utils/dom';
|
|
import { hasSentenceEnd } from 'utils/text';
|
|
|
|
import { NON_TOP_CANDIDATE_TAGS_RE } from './constants';
|
|
import { getScore } from './index';
|
|
|
|
// Now that we have a top_candidate, look through the siblings of
// it to see if any of them are decently scored. If they are, they
// may be split parts of the content (like two divs, a preamble and
// a body). Example:
// http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
/**
 * Gather the top candidate together with any of its siblings that look like
 * part of the same content, and return them inside a fresh wrapper `<div>`.
 *
 * A sibling is kept when its score plus a content bonus reaches the sibling
 * threshold, or when it is a paragraph whose text length / link density
 * suggest real prose. Kept nodes are appended to the wrapper, i.e. they are
 * moved out of their original parent.
 *
 * @param {Object} $candidate - Wrapped top-candidate node (cheerio/jQuery-like).
 * @param {number} topScore - Score of the top candidate; drives the threshold
 *   (`max(10, topScore * 0.25)`) and the same-class bonus (`topScore * 0.2`).
 * @param {Function} $ - Selector/wrapper function used to wrap raw nodes and
 *   to create the wrapper `<div>`.
 * @returns {Object} `$candidate` itself when it has no parent or when no
 *   sibling was merged; otherwise the wrapper `<div>` holding the kept nodes
 *   in document order.
 */
export default function mergeSiblings($candidate, topScore, $) {
  const $parent = $candidate.parent();

  // A candidate without a parent has no siblings to merge.
  if (!$parent.length) {
    return $candidate;
  }

  const siblingScoreThreshold = Math.max(10, topScore * 0.25);
  const candidateNode = $candidate.get(0);
  const candidateClass = $candidate.attr('class');
  const wrappingDiv = $('<div></div>');

  $parent.children().each((index, sibling) => {
    // Ignore tags like BR, HR, etc
    if (NON_TOP_CANDIDATE_TAGS_RE.test(sibling.tagName)) {
      return;
    }

    const $sibling = $(sibling);
    const siblingScore = getScore($sibling);

    // Unscored nodes are never merged.
    if (!siblingScore) {
      return;
    }

    // The candidate itself is always kept (once it has a score).
    if (sibling === candidateNode) {
      wrappingDiv.append($sibling);
      return;
    }

    const density = linkDensity($sibling);
    let contentBonus = 0;

    // Very low link density earns a small bonus.
    if (density < 0.05) {
      contentBonus += 20;
    }

    // High link density earns a penalty.
    if (density >= 0.5) {
      contentBonus -= 20;
    }

    // Same class attribute as the candidate earns a bonus.
    // NOTE(review): two nodes that both lack a class attribute also compare
    // equal here — confirm that is intended.
    if ($sibling.attr('class') === candidateClass) {
      contentBonus += topScore * 0.2;
    }

    if (siblingScore + contentBonus >= siblingScoreThreshold) {
      wrappingDiv.append($sibling);
      return;
    }

    // Paragraph fallback. Tag names are lower-case in some DOM
    // implementations and upper-case in others (e.g. browsers for HTML
    // documents), so normalise before comparing.
    const tagName = (sibling.tagName || '').toLowerCase();
    if (tagName !== 'p') {
      return;
    }

    const siblingContent = $sibling.text();
    const siblingContentLength = textLength(siblingContent);

    const isLongProse = siblingContentLength > 80 && density < 0.25;
    const isShortSentence =
      siblingContentLength <= 80 &&
      density === 0 &&
      hasSentenceEnd(siblingContent);

    if (isLongProse || isShortSentence) {
      wrappingDiv.append($sibling);
    }
  });

  // Nothing but the candidate was collected: hand the candidate back as-is
  // instead of the single-child wrapper.
  const $merged = wrappingDiv.children();
  if ($merged.length === 1 && $merged.first().get(0) === candidateNode) {
    return $candidate;
  }

  return wrappingDiv;
}
|