feat: nextPageUrl handles multi-page articles
Squashed commit of the following: commit b5070c0967a7f1a0c0c449ba7ea40aebe8fe4bb8 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 13 10:03:00 2016 -0400 root extractor includes next page url commit 79be83127d5342d89eef33665586fabea227d6b3 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Sep 13 09:58:20 2016 -0400 small score adjustment commit 0f00507dbff43401145a892e849311518edec68a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 12 18:17:38 2016 -0400 feat: nextPageUrl generic parser up and running commit be91c589fc0c6d6f9b573080a76c9b1ac7af710c Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 12 11:53:58 2016 -0400 feat: pageNumFromUrl extracts the pagenum of the current url commit ad879d7aabedadfd051c01b42d841703bf4763fa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Sep 12 11:52:37 2016 -0400 feat: isWordpress checks if a page is generated by wordpresspull/1/head
parent
a89b9b785e
commit
7ec0ed0d31
File diff suppressed because one or more lines are too long
@ -0,0 +1,54 @@
|
||||
import 'babel-polyfill'
|
||||
import URL from 'url'
|
||||
|
||||
import {
|
||||
pageNumFromUrl,
|
||||
articleBaseUrl,
|
||||
removeAnchor,
|
||||
} from 'utils/text'
|
||||
import scoreLinks from './scoring/score-links'
|
||||
|
||||
// Looks for and returns next page url
// for multi-page articles
const GenericNextPageUrlExtractor = {
  // Extract the most likely next-page URL from a parsed document.
  //
  // $            cheerio handle for the document
  // url          the url the document was fetched from
  // parsedUrl    optional pre-parsed url; derived via URL.parse(url) when absent
  // previousUrls urls already fetched for this article, so a page we have
  //              already seen is never returned again
  //
  // Returns the next page's url as a string, or null when no candidate
  // link scores highly enough.
  extract({ $, url, parsedUrl, previousUrls = [] }) {
    parsedUrl = parsedUrl || URL.parse(url)

    // NOTE: previously also computed pageNumFromUrl(url) and destructured
    // `host` from parsedUrl; both were unused and have been removed.
    const articleUrl = removeAnchor(url)
    const baseUrl = articleBaseUrl(url, parsedUrl)

    const links = $('a[href]').toArray()

    const scoredLinks = scoreLinks({
      links,
      articleUrl,
      baseUrl,
      parsedUrl,
      $,
      previousUrls,
    })

    // If no links were scored, return null
    if (!scoredLinks) return null

    // now that we've scored all possible pages,
    // find the biggest one.
    const topPage = Reflect.ownKeys(scoredLinks).reduce((acc, link) => {
      const scoredLink = scoredLinks[link]
      return scoredLink.score > acc.score ? scoredLink : acc
    }, { score: -100 })

    // If the score is less than 50, we're not confident enough to use it,
    // so we fail.
    if (topPage.score >= 50) {
      return topPage.href
    }

    return null
  },
}

export default GenericNextPageUrlExtractor
|
@ -0,0 +1,34 @@
|
||||
import assert from 'assert'
|
||||
import fs from 'fs'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import GenericNextPageUrlExtractor from './extractor'
|
||||
|
||||
describe('GenericNextPageUrlExtractor', () => {
  it('returns most likely next page url', () => {
    const html = fs.readFileSync('./fixtures/ars.html', 'utf8')
    const $ = cheerio.load(html)
    const url =
      'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'
    const next =
      'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2'

    const nextPage = GenericNextPageUrlExtractor.extract({ $, url })

    assert.equal(nextPage, next)
  })

  it('returns null if there is no likely next page', () => {
    // A document with no links at all can never yield a next page.
    const $ = cheerio.load('<div><p>HI</p></div>')
    const url = 'http://example.com/foo/bar'

    const nextPage = GenericNextPageUrlExtractor.extract({ $, url })

    assert.equal(nextPage, null)
  })
})
|
@ -0,0 +1,38 @@
|
||||
// Matches any single digit anywhere in a string.
export const DIGIT_RE = /\d/

// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
export const EXTRANEOUS_LINK_HINTS = [
  'print',
  'archive',
  'comment',
  'discuss',
  'e-mail',
  'email',
  'share',
  'reply',
  'all',
  'login',
  'sign',
  'single',
  'adx',
  'entry-unrelated',
]
// Case-insensitive alternation over every hint above.
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')

// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
export const NEXT_LINK_TEXT_RE = /(next|weiter|continue|>([^|]|$)|»([^|]|$))/i

// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = /(first|last|end)/i

// Match any link text/classname/id that looks like it means the previous
// page.
export const PREV_LINK_TEXT_RE = /(prev|earl|old|new|<|«)/i

// Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = /pag(e|ing|inat)/i
|
||||
|
@ -0,0 +1,301 @@
|
||||
import 'babel-polyfill'
|
||||
import URL from 'url'
|
||||
import difflib from 'difflib'
|
||||
|
||||
import { range } from 'utils'
|
||||
import { isWordpress } from 'utils/dom'
|
||||
import {
|
||||
removeAnchor,
|
||||
pageNumFromUrl,
|
||||
} from 'utils/text'
|
||||
import {
|
||||
DIGIT_RE,
|
||||
NEXT_LINK_TEXT_RE,
|
||||
PREV_LINK_TEXT_RE,
|
||||
EXTRANEOUS_LINK_HINTS_RE,
|
||||
CAP_LINK_TEXT_RE,
|
||||
PAGE_RE,
|
||||
} from './constants'
|
||||
|
||||
import {
|
||||
NEGATIVE_SCORE_RE,
|
||||
POSITIVE_SCORE_RE,
|
||||
} from 'utils/dom/constants'
|
||||
import { IS_DIGIT_RE } from 'utils/text/constants'
|
||||
|
||||
// Score every candidate link on a page as a possible next-page link.
// Returns an object keyed by href ({ score, linkText, href }), or null
// when no link was worth scoring at all.
export default function scoreLinks({
  links,
  articleUrl,
  baseUrl,
  parsedUrl,
  $,
  previousUrls = [],
}) {
  parsedUrl = parsedUrl || URL.parse(articleUrl)
  const baseRegex = makeBaseRegex(baseUrl)
  const isWp = isWordpress($)

  // Loop through all links, looking for hints that they may be next-page
  // links. Things like having "page" in their textContent, className or
  // id, or being a child of a node with a page-y className or id.
  //
  // After we do that, assign each page a score, and pick the one that
  // looks most like the next page link, as long as its score is strong
  // enough to have decent confidence.
  const scoredPages = links.reduce((candidates, link) => {
    // Remove any anchor data since we don't do a good job
    // standardizing URLs (it's hard), we're going to do
    // some checking with and without a trailing slash
    const href = removeAnchor(link.attribs.href)
    const $link = $(link)
    const linkText = $link.text()

    if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) {
      return candidates
    }

    // ## PASSED THE FIRST-PASS TESTS. Start scoring. ##
    if (candidates[href]) {
      // Same href seen again: accumulate all of its link text.
      candidates[href].linkText = `${candidates[href].linkText}|${linkText}`
    } else {
      candidates[href] = {
        score: 0,
        linkText,
        href,
      }
    }

    const candidate = candidates[href]
    const linkData = makeSig($link, linkText)
    const pageNum = pageNumFromUrl(href)

    // Sum every independent scoring signal...
    let score = [
      scoreBaseUrl(href, baseRegex),
      scoreNextLinkText(linkData),
      scoreCapLinks(linkData),
      scorePrevLink(linkData),
      scoreByParents($link),
      scoreExtraneousLinks(href),
      scorePageInLink(pageNum, isWp),
      scoreLinkText(linkText, pageNum),
    ].reduce((total, points) => total + points, 0)
    // ...then fold in the URL-similarity adjustment, which depends on the
    // score accumulated so far.
    score += scoreSimilarity(score, articleUrl, href)

    candidate.score = score

    return candidates
  }, {})

  return Reflect.ownKeys(scoredPages).length === 0 ? null : scoredPages
}
|
||||
|
||||
// Build a case-insensitive regex that anchors a match at the start of
// a string equal to baseUrl.
export function makeBaseRegex(baseUrl) {
  return new RegExp('^' + baseUrl, 'i')
}
|
||||
|
||||
export function scoreSimilarity(score, articleUrl, href) {
  // Do this last and only if we have a real candidate, because it's
  // potentially expensive computationally. Compare the link to this
  // URL using difflib to get the % similarity of these URLs. On a
  // sliding scale, subtract points from this link based on
  // similarity.
  if (score <= 0) {
    return 0
  }

  const matcher = new difflib.SequenceMatcher(null, articleUrl, href)
  const similarity = matcher.ratio()

  // The modifier pivots at 20% difference: links less different than
  // that earn a bonus, more different ones are penalized. Ex:
  //  4% different = +40 points
  // 20% different =   0 points
  // 30% different = -25 points
  const diffPercent = 1.0 - similarity
  const diffModifier = -(250 * (diffPercent - 0.2))
  return score + diffModifier
}
|
||||
|
||||
export function scoreLinkText(linkText, pageNum) {
  // If the link text can be parsed as a number, give it a minor
  // bonus, with a slight bias towards lower numbered pages. This is
  // so that pages that might not have 'next' in their text can still
  // get scored, and sorted properly by score.
  let score = 0

  if (IS_DIGIT_RE.test(linkText.trim())) {
    // Fix: always pass a radix to parseInt (linkText is base-10 digits).
    const linkTextAsNum = parseInt(linkText, 10)
    // If it's the first page, we already got it on the first call.
    // Give it a negative score. Otherwise, up to page 10, give a
    // small bonus.
    if (linkTextAsNum < 2) {
      score = -30
    } else {
      score = Math.max(0, 10 - linkTextAsNum)
    }

    // If it appears that the current page number is greater than
    // this links page number, it's a very bad sign. Give it a big
    // penalty.
    if (pageNum && pageNum >= linkTextAsNum) {
      score = score - 50
    }
  }

  return score
}
|
||||
|
||||
// A page number in the link is a bonus. Intentionally ignore wordpress
// because their ?p=123 link style gets caught by this even though it
// means separate documents entirely.
export function scorePageInLink(pageNum, isWp) {
  return pageNum && !isWp ? 50 : 0
}
|
||||
|
||||
// If the URL itself contains extraneous values (comment/share/print
// and friends), give a penalty.
export function scoreExtraneousLinks(href) {
  return EXTRANEOUS_LINK_HINTS_RE.test(href) ? -25 : 0
}
|
||||
|
||||
export function scoreByParents($link) {
  // If a parent node contains paging-like classname or id, give a
  // bonus. Additionally, if a parent_node contains bad content
  // (like 'sponsor'), give a penalty.
  let $parent = $link.parent()
  let positiveMatch = false
  let negativeMatch = false
  let score = 0

  // Inspect up to five levels of ancestors, stopping when there are
  // no more parents to climb.
  for (const _ of range(0, 4)) {
    if ($parent.length === 0) {
      break
    }

    const parentData = makeSig($parent, ' ')

    // If we have 'page' or 'paging' in our data, that's a good
    // sign. Add a bonus (at most once).
    if (!positiveMatch && PAGE_RE.test(parentData)) {
      positiveMatch = true
      score += 25
    }

    // If we have 'comment' or something in our data, and
    // we don't have something like 'content' as well, that's
    // a bad sign. Give a penalty (at most once).
    if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData)
        && EXTRANEOUS_LINK_HINTS_RE.test(parentData)) {
      if (!POSITIVE_SCORE_RE.test(parentData)) {
        negativeMatch = true
        score -= 25
      }
    }

    $parent = $parent.parent()
  }

  return score
}
|
||||
|
||||
// If the link has something like "previous", its definitely
// an old link; penalize it heavily.
export function scorePrevLink(linkData) {
  if (!PREV_LINK_TEXT_RE.test(linkData)) {
    return 0
  }

  return -200
}
|
||||
|
||||
export function scoreCapLinks(linkData) {
  // Cap links are links like "last", etc. A signature that matches a
  // cap word AND next-link text (e.g. "next last page") is penalized;
  // a cap word alone, or no cap word at all, scores 0.
  if (CAP_LINK_TEXT_RE.test(linkData) && NEXT_LINK_TEXT_RE.test(linkData)) {
    return -65
  }

  return 0
}
|
||||
|
||||
// Reward signatures containing things like "next", ">>", etc.
export function scoreNextLinkText(linkData) {
  return NEXT_LINK_TEXT_RE.test(linkData) ? 50 : 0
}
|
||||
|
||||
// If the baseUrl isn't part of this URL, penalize this
// link. It could still be the link, but the odds are lower.
// Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
export function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25
}
|
||||
|
||||
// First-pass filter deciding whether a link is even worth scoring.
// Returns false for links we've already fetched, links that point at
// the article itself or off-site, links with no digit beyond the base
// URL, and links whose text is extraneous or too long.
export function shouldScore(
  href,
  articleUrl,
  baseUrl,
  parsedUrl,
  linkText,
  previousUrls
) {
  // skip if we've already fetched this url
  // (includes() replaces the previous find(...) !== undefined idiom)
  if (previousUrls.includes(href)) {
    return false
  }

  // If we've already parsed this URL, or the URL matches the base
  // URL, or is empty, skip it.
  if (!href || href === articleUrl || href === baseUrl) {
    return false
  }

  const { hostname } = parsedUrl
  const { hostname: linkHost } = URL.parse(href)

  // Domain mismatch.
  if (linkHost !== hostname) {
    return false
  }

  // If href doesn't contain a digit after removing the base URL,
  // it's certainly not the next page.
  const fragment = href.replace(baseUrl, '')
  if (!DIGIT_RE.test(fragment)) {
    return false
  }

  // This link has extraneous content (like "comment") in its link
  // text, so we skip it.
  if (EXTRANEOUS_LINK_HINTS_RE.test(linkText)) {
    return false
  }

  // Next page link text is never long, skip if it is too long.
  if (linkText.length > 25) {
    return false
  }

  return true
}
|
||||
|
||||
// Build a text "signature" for a node: the supplied linkText (or the
// node's own text when falsy) plus its class and id attributes.
function makeSig($link, linkText) {
  const text = linkText || $link.text()
  const className = $link.attr('class') || ''
  const id = $link.attr('id') || ''
  return `${text} ${className} ${id}`
}
|
@ -0,0 +1,239 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import fs from 'fs'
|
||||
import URL from 'url'
|
||||
|
||||
import scoreLinks from './score-links'
|
||||
import {
|
||||
makeBaseRegex,
|
||||
scoreBaseUrl,
|
||||
scoreNextLinkText,
|
||||
scoreCapLinks,
|
||||
scorePrevLink,
|
||||
scoreByParents,
|
||||
scoreExtraneousLinks,
|
||||
scorePageInLink,
|
||||
scoreLinkText,
|
||||
scoreSimilarity,
|
||||
shouldScore,
|
||||
} from './score-links'
|
||||
|
||||
describe('scoreLinks(links)', () => {
  it('returns an object of scored links', () => {
    const html = fs.readFileSync('./fixtures/ars.html', 'utf8')
    const $ = cheerio.load(html)
    const links = $('a[href]').toArray()
    const url =
      'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'

    const scoredPages = scoreLinks({
      links,
      articleUrl: url,
      baseUrl: 'http://arstechnica.com',
      $,
    })

    assert.equal(typeof scoredPages, 'object')
  })

  it('returns null if no possible pages', () => {
    // No anchors at all means nothing can be scored.
    const $ = cheerio.load('<div><p>Hello wow</p></div>')
    const links = $('a[href]').toArray()
    const url =
      'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'

    const scoredPages = scoreLinks({
      links,
      articleUrl: url,
      baseUrl: 'http://arstechnica.com',
      $,
    })

    assert.equal(scoredPages, null)
  })
})
|
||||
|
||||
describe('scoreBaseUrl(href, baseRegex)', () => {
  it('returns -25 if url does not contain the base url', () => {
    const baseRegex = makeBaseRegex('http://example.com/foo/bar')

    assert.equal(scoreBaseUrl('http://foo.com/foo/bar', baseRegex), -25)
  })

  it('returns 0 if url contains the base url', () => {
    const baseRegex = makeBaseRegex('http://example.com/foo/bar')

    assert.equal(scoreBaseUrl('http://example.com/foo/bar/bat', baseRegex), 0)
  })
})

describe('scoreNextLinkText(linkData)', () => {
  it('returns 50 if contains common next link text', () => {
    assert.equal(scoreNextLinkText('foo bar Next page'), 50)
  })

  it('returns 0 if does not contain common next link text', () => {
    assert.equal(scoreNextLinkText('foo bar WOW GREAT'), 0)
  })
})

describe('scoreCapLinks(linkData)', () => {
  it('returns -65 if cap link with next link text', () => {
    assert.equal(scoreCapLinks('foo next Last page'), -65)
  })

  it('returns 0 if does not match a cap link', () => {
    assert.equal(scoreCapLinks('foo bar WOW GREAT'), 0)
  })
})
|
||||
|
||||
describe('scorePrevLink(linkData)', () => {
  it('returns -200 if link matches previous text', () => {
    const linkData = 'foo next previous page'

    assert.equal(scorePrevLink(linkData), -200)
  })

  it('returns 0 if does not match a prev link', () => {
    const linkData = 'foo bar WOW GREAT'

    // Fix: this previously asserted on scoreCapLinks (copy/paste slip);
    // the case under test is scorePrevLink.
    assert.equal(scorePrevLink(linkData), 0)
  })
})
|
||||
|
||||
describe('scoreByParents($link)', () => {
  it('returns 25 if parent sig looks like a page', () => {
    const html = `
      <div>
        <div class="next-page">
          <a href="blah">Next page</a>
        </div>
      </div>
    `
    const $link = cheerio.load(html)('a').first()

    assert.equal(scoreByParents($link), 25)
  })

  it('returns -25 if parent sig looks like a comment', () => {
    const html = `
      <div>
        <div class="comment">
          <a href="blah">Next page</a>
        </div>
      </div>
    `
    const $link = cheerio.load(html)('a').first()

    assert.equal(scoreByParents($link), -25)
  })
})

describe('scoreExtraneousLinks(href)', () => {
  it('returns -25 if link matches extraneous text', () => {
    assert.equal(scoreExtraneousLinks('http://example.com/email-link'), -25)
  })

  it('returns 0 if does not match extraneous text', () => {
    assert.equal(scoreExtraneousLinks('http://example.com/asdf'), 0)
  })
})

describe('scorePageInLink(pageNum, isWp)', () => {
  it('returns 50 if link contains a page num', () => {
    assert.equal(scorePageInLink(1, false), 50)
  })

  it('returns 0 if link contains no page num', () => {
    assert.equal(scorePageInLink(null, false), 0)
  })

  it('returns 0 if page is wordpress', () => {
    assert.equal(scorePageInLink(10, true), 0)
  })
})

describe('scoreLinkText(linkText)', () => {
  it('returns 8 if link contains the num 2', () => {
    assert.equal(scoreLinkText('2', 0), 8)
  })

  it('returns 5 if link contains the num 5', () => {
    assert.equal(scoreLinkText('5', 0), 5)
  })

  it('returns -30 if link contains the number 1', () => {
    assert.equal(scoreLinkText('1', 0), -30)
  })

  it('penalizes -50 if pageNum is >= link text as num', () => {
    assert.equal(scoreLinkText('4', 5), -44)
  })
})
|
||||
|
||||
describe('scoreSimilarity(score, articleUrl, href)', () => {
  it('returns a similarity bonus based on current score', () => {
    const articleUrl = 'http://example.com/foo/bar'
    const href = 'http://example.com/foo/bar/2'

    assert.equal(Math.round(scoreSimilarity(25, articleUrl, href)), 66)
  })

  it('returns 0 is current score <= 0', () => {
    const articleUrl = 'http://example.com/foo/bar'
    const href = 'http://example.com/foo/bar/2'

    assert.equal(scoreSimilarity(0, articleUrl, href), 0)
  })
})

describe('shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)', () => {
  it('returns false if href has already been fetched', () => {
    const href = 'http://example.com/foo/bar/2'
    const previousUrls = ['http://example.com/foo/bar/2']

    assert.equal(
      shouldScore(href, '', '', URL.parse(href), '', previousUrls),
      false
    )
  })

  it('returns true if href has not been fetched', () => {
    const href = 'http://example.com/foo/bar/2'
    const previousUrls = ['http://example.com/foo/bar']

    assert.equal(
      shouldScore(href, '', '', URL.parse(href), '', previousUrls),
      true
    )
  })
})
|
@ -0,0 +1,5 @@
|
||||
import { IS_WP_SELECTOR } from './constants'
|
||||
|
||||
// True when the document contains at least one node matched by
// IS_WP_SELECTOR (i.e. it declares a WordPress generator).
export default function isWordpress($) {
  const matches = $(IS_WP_SELECTOR)
  return matches.length > 0
}
|
@ -0,0 +1,43 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import isWordpress from './is-wordpress'
|
||||
|
||||
describe('isWordpress($)', () => {
  it('returns false if a site is not generated by wordpress', () => {
    const html = `
      <html>
        <head>
          <meta name="generator" value="whatever">
        <head>
      </html>
    `
    let $ = cheerio.load(html)

    assert.equal(isWordpress($), false)

    const html2 = `
      <html>
        <head>
          <meta name="foo" value="bar">
        <head>
      </html>
    `
    // Fix: this previously reloaded `html`, so html2 was never
    // actually exercised.
    $ = cheerio.load(html2)

    assert.equal(isWordpress($), false)
  })

  it('returns true if a site is generated by wordpress', () => {
    const html = `
      <html>
        <head>
          <meta name="generator" value="WordPress 4.7-alpha-38592">
        <head>
      </html>
    `
    const $ = cheerio.load(html)

    assert.equal(isWordpress($), true)
  })
})
|
@ -0,0 +1 @@
|
||||
export { default as range } from './range'
|
@ -0,0 +1,5 @@
|
||||
// Yield every integer from start through end, inclusive.
// range(2, 5) -> 2, 3, 4, 5; yields nothing when start > end.
export default function* range(start = 1, end = 1) {
  for (let current = start; current <= end; current += 1) {
    yield current
  }
}
|
@ -0,0 +1,75 @@
|
||||
import URL from 'url'
|
||||
import {
|
||||
HAS_ALPHA_RE,
|
||||
IS_ALPHA_RE,
|
||||
IS_DIGIT_RE,
|
||||
PAGE_IN_HREF_RE,
|
||||
} from './constants'
|
||||
|
||||
// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
export default function articleBaseUrl(url, parsedUrl) {
  parsedUrl = parsedUrl || URL.parse(url)
  const { protocol, host, path } = parsedUrl

  let firstSegmentHasLetters = false
  const cleanedSegments = path.split('/')
    .reverse()
    .reduce((acc, rawSegment, index) => {
      let segment = rawSegment

      // Split off and save anything that looks like a file type.
      if (segment.includes('.')) {
        const [possibleSegment, fileExt] = segment.split('.')
        if (IS_ALPHA_RE.test(fileExt)) {
          segment = possibleSegment
        }
      }

      // If our first or second segment has anything looking like a page
      // number, remove it.
      if (index < 2 && PAGE_IN_HREF_RE.test(segment)) {
        segment = segment.replace(PAGE_IN_HREF_RE, '')
      }

      // If we're on the first segment, check to see if we have any
      // characters in it. The first segment is actually the last bit of
      // the URL, and this will be helpful to determine if we're on a URL
      // segment that looks like "/2/" for example.
      if (index === 0) {
        firstSegmentHasLetters = HAS_ALPHA_RE.test(segment)
      }

      // If it's not marked for deletion, push it to cleaned_segments.
      if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
        acc.push(segment)
      }

      return acc
    }, [])

  return `${protocol}//${host}${cleanedSegments.reverse().join('/')}`
}
|
||||
|
||||
// Decide whether a path segment (segments are visited in reversed URL
// order, so index 0 is the last segment) should be kept when rebuilding
// an article's base URL.
function isGoodSegment(segment, index, firstSegmentHasLetters) {
  let goodSegment = true

  // NOTE: a previous branch here claimed to remove purely numeric
  // first/second segments but assigned `true`, making it a no-op; it
  // has been removed. Numeric-only trailing segments are still dropped
  // by the final length/letters check below, so behavior is unchanged.

  // If this is the first url_segment and it's just "index",
  // remove it
  if (index === 0 && segment.toLowerCase() === 'index') {
    goodSegment = false
  }

  // If our first or second url_segment is smaller than 3 characters,
  // and the first url_segment had no alphas, remove it.
  if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
    goodSegment = false
  }

  return goodSegment
}
|
@ -0,0 +1,21 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import articleBaseUrl from './article-base-url'
|
||||
|
||||
describe('articleBaseUrl(url, parsedUrl)', () => {
  it('returns the base url of a paginated url', () => {
    const url = 'http://example.com/foo/bar/wow-cool/page=10'
    const cleaned = 'http://example.com/foo/bar/wow-cool'

    assert.equal(articleBaseUrl(url), cleaned)
  })

  it('returns same url if url has no pagination info', () => {
    // A trailing slash is also dropped.
    const url = 'http://example.com/foo/bar/wow-cool/'
    const cleaned = 'http://example.com/foo/bar/wow-cool'

    assert.equal(articleBaseUrl(url), cleaned)
  })
})
|
||||
|
@ -0,0 +1,22 @@
|
||||
// An expression that looks to try to find the page digit within a URL, if
// it exists.
// Matches:
//  page=1
//  pg=1
//  p=1
//  paging=12
//  pag=7
//  pagination/1
//  paging/88
//  pa/83
//  p/11
//
// Does not match:
//  pg=102
//  page:2
export const PAGE_IN_HREF_RE = /(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|\/)([0-9]{1,3})/i

// At least one ASCII letter anywhere in the string.
export const HAS_ALPHA_RE = /[a-z]/i

// Entirely ASCII letters / entirely digits, respectively.
export const IS_ALPHA_RE = /^[a-z]+$/i
export const IS_DIGIT_RE = /^[0-9]+$/i
|
@ -1,3 +1,6 @@
|
||||
export { default as normalizeSpaces } from './normalize-spaces'
|
||||
export { default as extractFromUrl } from './extract-from-url'
|
||||
export { default as pageNumFromUrl } from './page-num-from-url'
|
||||
export { default as removeAnchor } from './remove-anchor'
|
||||
export { default as articleBaseUrl } from './article-base-url'
|
||||
|
||||
|
@ -0,0 +1,12 @@
|
||||
import { PAGE_IN_HREF_RE } from './constants'
|
||||
|
||||
// Extract the page number from a url, if one is present.
// Numbers of 100 or more are treated as false positives (more likely an
// article id than a page number) and yield null.
export default function pageNumFromUrl(url) {
  const matches = url.match(PAGE_IN_HREF_RE)
  if (!matches) return null

  // Group 6 of PAGE_IN_HREF_RE captures the digits.
  // Fix: always pass a radix to parseInt.
  const pageNum = parseInt(matches[6], 10)

  // Return pageNum < 100, otherwise
  // return null
  return pageNum < 100 ? pageNum : null
}
|
@ -0,0 +1,45 @@
|
||||
import assert from 'assert'
|
||||
|
||||
import pageNumFromUrl from './page-num-from-url'
|
||||
|
||||
describe('pageNumFromUrl(url)', () => {
  it('returns null if there is no page num in the url', () => {
    const urls = [
      'http://example.com',
      'http://example.com/?pg=102',
      'http://example.com/?page:102',
    ]

    urls.forEach((url) => assert.equal(pageNumFromUrl(url), null))
  })

  it('returns a page num if one matches the url', () => {
    // [url, expected page number]
    const cases = [
      ['http://example.com/foo?page=1', 1],
      ['http://example.com/foo?pg=1', 1],
      ['http://example.com/foo?p=1', 1],
      ['http://example.com/foo?paging=1', 1],
      ['http://example.com/foo?pag=1', 1],
      ['http://example.com/foo?pagination/1', 1],
      ['http://example.com/foo?paging/88', 88],
      ['http://example.com/foo?pa/88', 88],
      ['http://example.com/foo?p/88', 88],
    ]

    cases.forEach(([url, expected]) => {
      assert.equal(pageNumFromUrl(url), expected)
    })
  })
})
|
@ -0,0 +1,3 @@
|
||||
// Strip the #anchor fragment (if any) and a trailing slash from a url.
export default function removeAnchor(url) {
  const [withoutAnchor] = url.split('#')
  return withoutAnchor.replace(/\/$/, '')
}
|
@ -0,0 +1,21 @@
|
||||
import assert from 'assert'
|
||||
|
||||
import removeAnchor from './remove-anchor'
|
||||
|
||||
describe('removeAnchor(url)', () => {
  it('returns a url w/out #anchor', () => {
    const url = 'http://example.com/foo/bar/wow-cool/page=10/#wow'

    assert.equal(removeAnchor(url), 'http://example.com/foo/bar/wow-cool/page=10')
  })

  it('returns same url if url has no anchor found', () => {
    const url = 'http://example.com/foo/bar/wow-cool'

    assert.equal(removeAnchor(url), url)
  })
})
|
||||
|
||||
|
Loading…
Reference in New Issue