You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/extractors/generic/next-page-url/extractor.js

52 lines
1.2 KiB
JavaScript

import 'babel-polyfill';
import URL from 'url';
import {
articleBaseUrl,
removeAnchor,
} from 'utils/text';
import scoreLinks from './scoring/score-links';
// Looks for and returns next page url
// for multi-page articles
const GenericNextPageUrlExtractor = {
extract({ $, url, parsedUrl, previousUrls = [] }) {
parsedUrl = parsedUrl || URL.parse(url);
const articleUrl = removeAnchor(url);
const baseUrl = articleBaseUrl(url, parsedUrl);
const links = $('a[href]').toArray();
const scoredLinks = scoreLinks({
links,
articleUrl,
baseUrl,
parsedUrl,
$,
previousUrls,
});
// If no links were scored, return null
if (!scoredLinks) return null;
// now that we've scored all possible pages,
// find the biggest one.
const topPage = Reflect.ownKeys(scoredLinks).reduce((acc, link) => {
const scoredLink = scoredLinks[link];
return scoredLink.score > acc.score ? scoredLink : acc;
}, { score: -100 });
// If the score is less than 50, we're not confident enough to use it,
// so we fail.
if (topPage.score >= 50) {
return topPage.href;
}
return null;
},
};
export default GenericNextPageUrlExtractor;