feat: nextPageUrl handles multi-page articles

Squashed commit of the following:

commit b5070c0967a7f1a0c0c449ba7ea40aebe8fe4bb8
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 13 10:03:00 2016 -0400

    root extractor includes next page url

commit 79be83127d5342d89eef33665586fabea227d6b3
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 13 09:58:20 2016 -0400

    small score adjustment

commit 0f00507dbff43401145a892e849311518edec68a
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 12 18:17:38 2016 -0400

    feat: nextPageUrl generic parser up and running

commit be91c589fc0c6d6f9b573080a76c9b1ac7af710c
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 12 11:53:58 2016 -0400

    feat: pageNumFromUrl extracts the pagenum of the current url

commit ad879d7aabedadfd051c01b42d841703bf4763fa
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 12 11:52:37 2016 -0400

    feat: isWordpress checks if a page is generated by wordpress
pull/1/head
Adam Pash 8 years ago
parent a89b9b785e
commit 7ec0ed0d31

File diff suppressed because one or more lines are too long

@ -28,6 +28,7 @@
},
"dependencies": {
"cheerio": "^0.20.0",
"difflib": "^0.2.4",
"moment": "^2.14.1",
"request-promise": "^4.1.1",
"valid-url": "^1.0.9",

@ -263,67 +263,9 @@ export const NEGATIVE_SCORE_HINTS = [
// The above list, joined into a matching regular expression
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')
// XPath to try to determine if a page is wordpress. Not always successful.
export const IS_WP_XPATH = "//meta[@name='generator'][starts-with(@value,'WordPress')]"
// Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]')
// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
export const EXTRANEOUS_LINK_HINTS = [
'print',
'archive',
'comment',
'discuss',
'e-mail',
'email',
'share',
'reply',
'all',
'login',
'sign',
'single',
'adx',
'entry-unrelated'
]
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')
// An expression that looks to try to find the page digit within a URL, if
// it exists.
// Matches:
// page=1
// pg=1
// p=1
// paging=12
// pag=7
// pagination/1
// paging/88
// pa/83
// p/11
//
// Does not match:
// pg=102
// page:2
// DISABLING FOR NOW TODO AP
// export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)?(=|\/)(?P<pagenum>[0-9]{1,2})))', 'i')
// Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i')
// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i')
// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i')
// Match any link text/classname/id that looks like it means the previous
// page.
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i')
// Match 2 or more consecutive <br> tags
export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i')

@ -6,6 +6,7 @@ import GenericAuthorExtractor from './author/extractor'
import GenericDatePublishedExtractor from './date-published/extractor'
import GenericDekExtractor from './dek/extractor'
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
import GenericNextPageUrlExtractor from './next-page-url/extractor'
const GenericExtractor = {
// This extractor is the default for all domains
@ -16,6 +17,7 @@ const GenericExtractor = {
content: GenericContentExtractor.extract.bind(GenericContentExtractor),
leadImageUrl: GenericLeadImageUrlExtractor.extract,
dek: GenericDekExtractor.extract,
nextPageUrl: GenericNextPageUrlExtractor.extract,
extract: function(options) {
let { html } = options
@ -31,6 +33,7 @@ const GenericExtractor = {
const content = this.content({ ...options, title })
const leadImageUrl = this.leadImageUrl(options)
const dek = this.dek(options)
const nextPageUrl = this.nextPageUrl(options)
return {
title,
@ -39,6 +42,7 @@ const GenericExtractor = {
dek,
leadImageUrl,
content,
nextPageUrl,
}
}
}

@ -0,0 +1,54 @@
import 'babel-polyfill'
import URL from 'url'
import {
pageNumFromUrl,
articleBaseUrl,
removeAnchor,
} from 'utils/text'
import scoreLinks from './scoring/score-links'
// Looks for and returns next page url
// for multi-page articles
// Generic extractor that looks for and returns the next page url
// for multi-page articles.
//
// Every anchor on the page is scored as a possible "next page" link;
// the highest-scoring href is returned only when its score clears the
// confidence threshold, otherwise null.
//
// Fix: removed dead locals (`currentPageNum`, `host`) that were
// computed but never used.
const GenericNextPageUrlExtractor = {
  extract({ $, url, parsedUrl, previousUrls=[] }) {
    parsedUrl = parsedUrl || URL.parse(url)

    // Anchor-free url and pagination-free base url, used by the
    // scorers for exact-match and similarity comparisons.
    const articleUrl = removeAnchor(url)
    const baseUrl = articleBaseUrl(url, parsedUrl)

    const links = $('a[href]').toArray()

    const scoredLinks = scoreLinks({
      links,
      articleUrl,
      baseUrl,
      parsedUrl,
      $,
      previousUrls
    })

    // If no links were scored, return null
    if (!scoredLinks) return null

    // now that we've scored all possible pages,
    // find the biggest one.
    const topPage = Reflect.ownKeys(scoredLinks).reduce((acc, link) => {
      const scoredLink = scoredLinks[link]
      return scoredLink.score > acc.score ? scoredLink : acc
    }, { score: -100 })

    // If the score is less than 50, we're not confident enough to use it,
    // so we fail.
    if (topPage.score >= 50) {
      return topPage.href
    } else {
      return null
    }
  }
}

export default GenericNextPageUrlExtractor

@ -0,0 +1,34 @@
import assert from 'assert'
import fs from 'fs'
import cheerio from 'cheerio'
import GenericNextPageUrlExtractor from './extractor'
// Integration tests for the generic next-page-url extractor, run under
// mocha against a saved Ars Technica fixture page.
describe('GenericNextPageUrlExtractor', () => {
  it('returns most likely next page url', () => {
    const html = fs.readFileSync('./fixtures/ars.html', 'utf8')
    const $ = cheerio.load(html)
    const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'
    // The fixture's page-2 link is expected to win the scoring.
    const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2'

    const nextPage = GenericNextPageUrlExtractor.extract({
      $,
      url
    })

    assert.equal(nextPage, next)
  })

  it('returns null if there is no likely next page', () => {
    // A document with no anchors at all cannot produce a candidate.
    const html = `<div><p>HI</p></div>`
    const $ = cheerio.load(html)
    const url = 'http://example.com/foo/bar'

    const nextPage = GenericNextPageUrlExtractor.extract({
      $,
      url
    })

    assert.equal(nextPage, null)
  })
})

@ -0,0 +1,38 @@
// Matches any single digit.
export const DIGIT_RE = /\d/

// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
export const EXTRANEOUS_LINK_HINTS = [
  'print',
  'archive',
  'comment',
  'discuss',
  'e-mail',
  'email',
  'share',
  'reply',
  'all',
  'login',
  'sign',
  'single',
  'adx',
  'entry-unrelated'
]

// The hints above, joined into one case-insensitive alternation.
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')

// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
export const NEXT_LINK_TEXT_RE = /(next|weiter|continue|>([^|]|$)|»([^|]|$))/i

// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = /(first|last|end)/i

// Match any link text/classname/id that looks like it means the previous
// page.
export const PREV_LINK_TEXT_RE = /(prev|earl|old|new|<|«)/i

// Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = /pag(e|ing|inat)/i

@ -0,0 +1,301 @@
import 'babel-polyfill'
import URL from 'url'
import difflib from 'difflib'
import { range } from 'utils'
import { isWordpress } from 'utils/dom'
import {
removeAnchor,
pageNumFromUrl,
} from 'utils/text'
import {
DIGIT_RE,
NEXT_LINK_TEXT_RE,
PREV_LINK_TEXT_RE,
EXTRANEOUS_LINK_HINTS_RE,
CAP_LINK_TEXT_RE,
PAGE_RE,
} from './constants'
import {
NEGATIVE_SCORE_RE,
POSITIVE_SCORE_RE,
} from 'utils/dom/constants'
import { IS_DIGIT_RE } from 'utils/text/constants'
// Score every candidate anchor on the page as a potential next-page
// link. Returns a map of href -> { score, linkText, href }, or null
// when no link survived the first-pass filters.
export default function scoreLinks({
  links,
  articleUrl,
  baseUrl,
  parsedUrl,
  $,
  previousUrls=[]
}) {
  parsedUrl = parsedUrl || URL.parse(articleUrl)
  const baseRegex = makeBaseRegex(baseUrl)
  const isWp = isWordpress($)

  // Walk all links, looking for hints that they may be next-page
  // links — "page" in their text/class/id, a page-y parent node,
  // etc. — then give each surviving candidate a composite score.
  const candidates = {}
  for (const link of links) {
    // Strip anchor data up front; URL standardization is hard, and
    // comparisons elsewhere are done with and without trailing parts.
    const href = removeAnchor(link.attribs.href)
    const $link = $(link)
    const linkText = $link.text()

    // First-pass disqualifiers: already fetched, wrong host,
    // extraneous text, no digits, etc.
    if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) {
      continue
    }

    // Duplicate hrefs share one entry; their link texts are merged so
    // every occurrence feeds the signature-based scorers.
    if (!candidates[href]) {
      candidates[href] = {
        score: 0,
        linkText,
        href,
      }
    } else {
      candidates[href].linkText = `${candidates[href].linkText}|${linkText}`
    }

    const candidate = candidates[href]
    const linkData = makeSig($link, linkText)
    const pageNum = pageNumFromUrl(href)

    let score = scoreBaseUrl(href, baseRegex)
    score += scoreNextLinkText(linkData)
    score += scoreCapLinks(linkData)
    score += scorePrevLink(linkData)
    score += scoreByParents($link)
    score += scoreExtraneousLinks(href)
    score += scorePageInLink(pageNum, isWp)
    score += scoreLinkText(linkText, pageNum)
    // Similarity runs last: it reads the score accumulated so far and
    // only fires for links that are already positive candidates.
    score += scoreSimilarity(score, articleUrl, href)

    candidate.score = score
  }

  return Reflect.ownKeys(candidates).length === 0 ? null : candidates
}
// Build a case-insensitive regex that matches strings beginning with
// baseUrl.
//
// Fix: baseUrl was interpolated into the pattern unescaped, so URL
// characters like '.', '?' and '+' acted as regex metacharacters
// (e.g. 'example.com' also matched 'exampleXcom'). Escape them so the
// base url is matched literally.
export function makeBaseRegex(baseUrl) {
  const escaped = baseUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  return new RegExp(`^${escaped}`, 'i')
}
// Add a similarity-based modifier to an already-positive score.
//
// Runs last and only for positive candidates because the difflib
// comparison is potentially expensive. The modifier pivots around a
// 20% difference between the article url and the candidate href:
// less-different urls earn a bonus, more-different urls a penalty,
// scaled by 250. Ex: 12% different = +20, 20% = 0, 30% = -25.
//
// Note: the returned value INCLUDES the incoming score (callers add
// it on top, effectively doubling the base score for candidates).
export function scoreSimilarity(score, articleUrl, href) {
  if (score <= 0) return 0

  const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio()
  const diffPercent = 1.0 - similarity
  const diffModifier = -(250 * (diffPercent - 0.2))
  return score + diffModifier
}
// Score a link purely by its text when that text is numeric.
//
// If the link text can be parsed as a number, give it a minor bonus
// with a slight bias towards lower-numbered pages, so numeric
// pagination links ("2", "3", ...) sort sensibly even without "next"
// in their text.
//
// Fix: parseInt is now called with an explicit radix of 10.
export function scoreLinkText(linkText, pageNum) {
  let score = 0

  if (IS_DIGIT_RE.test(linkText.trim())) {
    const linkTextAsNum = parseInt(linkText, 10)
    // If it's the first page, we already got it on the first call.
    // Give it a negative score. Otherwise, up to page 10, give a
    // small bonus.
    if (linkTextAsNum < 2) {
      score = -30
    } else {
      score = Math.max(0, 10 - linkTextAsNum)
    }

    // If the current page number is greater than or equal to this
    // link's number, the link almost certainly points backwards.
    // Give it a big penalty.
    if (pageNum && pageNum >= linkTextAsNum) {
      score = score - 50
    }
  }

  return score
}
// Bonus for a detectable page number in the href — except on
// WordPress sites, whose "?p=123" urls identify entirely separate
// posts rather than pages of the same article.
export function scorePageInLink(pageNum, isWp) {
  return pageNum && !isWp ? 50 : 0
}
// Penalize hrefs containing extraneous hints ("comment", "share",
// "print", ...) that mark a link as unlikely to be a next page.
export function scoreExtraneousLinks(href) {
  return EXTRANEOUS_LINK_HINTS_RE.test(href) ? -25 : 0
}
// Walk up to five ancestors of $link, adjusting the score based on
// each ancestor's text/class/id signature.
export function scoreByParents($link) {
  // If a parent node contains paging-like classname or id, give a
  // bonus. Additionally, if a parent_node contains bad content
  // (like 'sponsor'), give a penalty.
  let $parent = $link.parent()
  let positiveMatch = false
  let negativeMatch = false
  let score = 0

  // range(0, 4) yields 0..4, so at most five ancestor levels are
  // examined; the bonus and the penalty are each applied at most once.
  Array.from(range(0, 4)).forEach((_) => {
    if ($parent.length === 0) {
      return
    }

    const parentData = makeSig($parent, ' ')

    // If we have 'page' or 'paging' in our data, that's a good
    // sign. Add a bonus.
    if (!positiveMatch && PAGE_RE.test(parentData)) {
      positiveMatch = true
      score = score + 25
    }

    // If we have 'comment' or something in our data, and
    // we don't have something like 'content' as well, that's
    // a bad sign. Give a penalty.
    // NOTE(review): the penalty requires BOTH a negative hint and an
    // extraneous-link hint in the signature — confirm that both-ands
    // (rather than either alone) is intended.
    if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData)
      && EXTRANEOUS_LINK_HINTS_RE.test(parentData)) {
      if (!POSITIVE_SCORE_RE.test(parentData)) {
        negativeMatch = true
        score = score - 25
      }
    }

    $parent = $parent.parent()
  })

  return score
}
// Links labeled "previous", "older", etc. point backwards; disqualify
// them outright with a heavy penalty.
export function scorePrevLink(linkData) {
  return PREV_LINK_TEXT_RE.test(linkData) ? -200 : 0
}
// "Cap" links are links like "first", "last", "end", etc.
// A signature that reads as both a cap link AND a next link (e.g.
// "next | last") is penalized; cap text alone scores 0.
export function scoreCapLinks(linkData) {
  if (CAP_LINK_TEXT_RE.test(linkData) && NEXT_LINK_TEXT_RE.test(linkData)) {
    return -65
  }
  return 0
}
// Strong bonus for text that commonly labels a next-page link:
// "next", "continue", ">>", etc.
export function scoreNextLinkText(linkData) {
  return NEXT_LINK_TEXT_RE.test(linkData) ? 50 : 0
}
// Links that don't share the article's base URL are less likely to be
// its next page — they can still win, just with lower odds.
// Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
export function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25
}
// First-pass filter: decide whether a link is even worth scoring as a
// next-page candidate.
//
// Fix: replaced `previousUrls.find((url) => href === url) !== undefined`
// with the equivalent, idiomatic `previousUrls.includes(href)`.
export function shouldScore(
  href,
  articleUrl,
  baseUrl,
  parsedUrl,
  linkText,
  previousUrls
) {
  // skip if we've already fetched this url
  if (previousUrls.includes(href)) {
    return false
  }

  // If the URL is empty, or matches the article or base
  // URL exactly, skip it.
  if (!href || href === articleUrl || href === baseUrl) {
    return false
  }

  // Domain mismatch — next pages live on the same host.
  const { hostname } = parsedUrl
  const { hostname: linkHost } = URL.parse(href)
  if (linkHost !== hostname) {
    return false
  }

  // If href doesn't contain a digit after removing the base URL,
  // it's certainly not the next page.
  const fragment = href.replace(baseUrl, '')
  if (!DIGIT_RE.test(fragment)) {
    return false
  }

  // This link has extraneous content (like "comment") in its link
  // text, so we skip it.
  if (EXTRANEOUS_LINK_HINTS_RE.test(linkText)) {
    return false
  }

  // Next page link text is never long, skip if it is too long.
  if (linkText.length > 25) {
    return false
  }

  return true
}
// Build a scoring "signature" for a node: its text (or the provided
// override), class, and id, space-separated.
function makeSig($link, linkText) {
  const text = linkText || $link.text()
  const className = $link.attr('class') || ''
  const id = $link.attr('id') || ''
  return `${text} ${className} ${id}`
}

@ -0,0 +1,239 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import URL from 'url'
import scoreLinks from './score-links'
import {
makeBaseRegex,
scoreBaseUrl,
scoreNextLinkText,
scoreCapLinks,
scorePrevLink,
scoreByParents,
scoreExtraneousLinks,
scorePageInLink,
scoreLinkText,
scoreSimilarity,
shouldScore,
} from './score-links'
// Unit tests for the next-page link scorer, run under mocha against
// fixture HTML loaded with cheerio.
describe('scoreLinks(links)', () => {
  it('returns an object of scored links', () => {
    const html = fs.readFileSync('./fixtures/ars.html', 'utf8')
    const $ = cheerio.load(html)
    const links = $('a[href]').toArray()
    const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'

    const scoredPages = scoreLinks({
      links,
      articleUrl: url,
      baseUrl: 'http://arstechnica.com',
      $,
    })

    assert.equal(typeof scoredPages, 'object')
  })

  it('returns null if no possible pages', () => {
    // No anchors at all — every candidate is filtered out.
    const html = `<div><p>Hello wow</p></div>`
    const $ = cheerio.load(html)
    const links = $('a[href]').toArray()
    const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'

    const scoredPages = scoreLinks({
      links,
      articleUrl: url,
      baseUrl: 'http://arstechnica.com',
      $,
    })

    assert.equal(scoredPages, null)
  })
})

describe('scoreBaseUrl(href, baseRegex)', () => {
  it('returns -25 if url does not contain the base url', () => {
    const baseUrl = 'http://example.com/foo/bar'
    const badUrl = 'http://foo.com/foo/bar'
    const baseRegex = makeBaseRegex(baseUrl)

    assert.equal(scoreBaseUrl(badUrl, baseRegex), -25)
  })

  it('returns 0 if url contains the base url', () => {
    const baseUrl = 'http://example.com/foo/bar'
    // NOTE(review): despite its name, this url DOES start with the
    // base url and therefore scores 0; `goodUrl` would be clearer.
    const badUrl = 'http://example.com/foo/bar/bat'
    const baseRegex = makeBaseRegex(baseUrl)

    assert.equal(scoreBaseUrl(badUrl, baseRegex), 0)
  })
})

describe('scoreNextLinkText(linkData)', () => {
  it('returns 50 if contains common next link text', () => {
    const linkData = "foo bar Next page"

    assert.equal(scoreNextLinkText(linkData), 50)
  })

  it('returns 0 if does not contain common next link text', () => {
    const linkData = "foo bar WOW GREAT"

    assert.equal(scoreNextLinkText(linkData), 0)
  })
})

describe('scoreCapLinks(linkData)', () => {
  it('returns -65 if cap link with next link text', () => {
    // Contains both a cap hint ("Last") and a next hint ("next").
    const linkData = "foo next Last page"

    assert.equal(scoreCapLinks(linkData), -65)
  })

  it('returns 0 if does not match a cap link', () => {
    const linkData = "foo bar WOW GREAT"

    assert.equal(scoreCapLinks(linkData), 0)
  })
})
describe('scorePrevLink(linkData)', () => {
  it('returns -200 if link matches previous text', () => {
    const linkData = "foo next previous page"

    assert.equal(scorePrevLink(linkData), -200)
  })

  it('returns 0 if does not match a prev link', () => {
    const linkData = "foo bar WOW GREAT"

    // Fixed: this assertion previously called scoreCapLinks (a
    // copy-paste slip), leaving scorePrevLink's zero case untested.
    assert.equal(scorePrevLink(linkData), 0)
  })
})
// Ancestor-signature scoring: a page-like parent adds 25, a
// comment-like parent subtracts 25.
describe('scoreByParents($link)', () => {
  it('returns 25 if parent sig looks like a page', () => {
    const html = `
<div>
<div class="next-page">
<a href="blah">Next page</a>
</div>
</div>
`
    const $ = cheerio.load(html)
    const $link = $('a').first()

    assert.equal(scoreByParents($link), 25)
  })

  it('returns -25 if parent sig looks like a comment', () => {
    const html = `
<div>
<div class="comment">
<a href="blah">Next page</a>
</div>
</div>
`
    const $ = cheerio.load(html)
    const $link = $('a').first()

    assert.equal(scoreByParents($link), -25)
  })
})

describe('scoreExtraneousLinks(href)', () => {
  it('returns -25 if link matches extraneous text', () => {
    const url = "http://example.com/email-link"

    assert.equal(scoreExtraneousLinks(url), -25)
  })

  it('returns 0 if does not match extraneous text', () => {
    const url = "http://example.com/asdf"

    assert.equal(scoreExtraneousLinks(url), 0)
  })
})

describe('scorePageInLink(pageNum, isWp)', () => {
  it('returns 50 if link contains a page num', () => {
    assert.equal(scorePageInLink(1, false), 50)
  })

  it('returns 0 if link contains no page num', () => {
    assert.equal(scorePageInLink(null, false), 0)
  })

  it('returns 0 if page is wordpress', () => {
    // WordPress "?p=123" urls are separate posts, not pages.
    assert.equal(scorePageInLink(10, true), 0)
  })
})

// NOTE(review): scoreLinkText also takes a second `pageNum` argument,
// exercised by the last case below.
describe('scoreLinkText(linkText)', () => {
  it('returns 8 if link contains the num 2', () => {
    assert.equal(scoreLinkText('2', 0), 8)
  })

  it('returns 5 if link contains the num 5', () => {
    assert.equal(scoreLinkText('5', 0), 5)
  })

  it('returns -30 if link contains the number 1', () => {
    assert.equal(scoreLinkText('1', 0), -30)
  })

  it('penalizes -50 if pageNum is >= link text as num', () => {
    // Base score for '4' is 6; the backwards-link penalty makes -44.
    assert.equal(scoreLinkText('4', 5), -44)
  })
})

describe('scoreSimilarity(score, articleUrl, href)', () => {
  it('returns a similarity bonus based on current score', () => {
    const articleUrl = 'http://example.com/foo/bar'
    const href = 'http://example.com/foo/bar/2'
    const score = 25

    assert.equal(
      Math.round(scoreSimilarity(score, articleUrl, href)),
      66
    )
  })

  it('returns 0 is current score <= 0', () => {
    const articleUrl = 'http://example.com/foo/bar'
    const href = 'http://example.com/foo/bar/2'
    const score = 0

    assert.equal(scoreSimilarity(score, articleUrl, href), 0)
  })
})

describe('shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)', () => {
  it('returns false if href has already been fetched', () => {
    const previousUrls = [ 'http://example.com/foo/bar/2' ]
    const href = 'http://example.com/foo/bar/2'
    const parsedUrl = URL.parse(href)

    assert.equal(
      shouldScore(href, '', '', parsedUrl, '', previousUrls),
      false
    )
  })

  it('returns true if href has not been fetched', () => {
    const previousUrls = [ 'http://example.com/foo/bar' ]
    const href = 'http://example.com/foo/bar/2'
    const parsedUrl = URL.parse(href)

    assert.equal(
      shouldScore(href, '', '', parsedUrl, '', previousUrls),
      true
    )
  })
})

@ -7,7 +7,7 @@ import { ATTR_RE } from './constants'
const RootExtractor = {
extract(extractor=GenericExtractor, opts) {
const { $ } = opts
const { $, contentOnly, extractedTitle } = opts
// This is the generic extractor. Run its extract method
if (extractor.domain === '*') return extractor.extract(opts)
@ -16,23 +16,33 @@ const RootExtractor = {
extractor
}
const title = extract({ ...opts, type: 'title' })
const datePublished = extract({ ...opts, type: 'datePublished' })
const author = extract({ ...opts, type: 'author' })
const content = extract({
...opts, type: 'content', extractHtml: true, title
})
const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', content })
const dek = extract({ ...opts, type: 'dek', content })
return {
title,
content,
author,
datePublished,
leadImageUrl,
dek,
if (contentOnly) {
const content = extract({
...opts, type: 'content', extractHtml: true, title: extractedTitle
})
return {
content
}
} else {
const title = extract({ ...opts, type: 'title' })
const datePublished = extract({ ...opts, type: 'datePublished' })
const author = extract({ ...opts, type: 'author' })
const nextPageUrl = extract({ ...opts, type: 'nextPageUrl' })
const content = extract({
...opts, type: 'content', extractHtml: true, title
})
const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', content })
const dek = extract({ ...opts, type: 'dek', content })
return {
title,
content,
author,
datePublished,
leadImageUrl,
dek,
}
}
}
}

@ -3,10 +3,12 @@ import fs from 'fs'
import Resource from 'resource'
import getExtractor from 'extractors/get-extractor'
import RootExtractor from 'extractors/root-extractor'
import { removeAnchor } from 'utils/text'
const Iris = {
parse: async function(url, html) {
const $ = await Resource.create(url, html)
parse: async function(url, html, opts={}) {
const { fetchAllPages=true } = opts || true
let $ = await Resource.create(url, html)
html = $.html()
const Extractor = getExtractor(url)
@ -18,9 +20,61 @@ const Iris = {
return $(node).attr('name')
}).toArray()
const result = RootExtractor.extract(Extractor, { url, html, $, metaCache })
let extractorOpts = { url, html, $, metaCache }
let result = RootExtractor.extract(Extractor, extractorOpts)
let { nextPageUrl, title } = result
if (fetchAllPages && nextPageUrl) {
result = await collectAllPages({ nextPageUrl, html, $, metaCache, result, Extractor, title, url })
}
return result
}
}
// Fetch and append every subsequent page of a multi-page article.
//
// Starting from `nextPageUrl`, repeatedly fetches the next page,
// extracts its content only (reusing the already-extracted title),
// and appends it to `result.content` behind an <hr> and a per-page
// heading. Stops when no further next-page url is found or after 25
// additional pages (the `pages < 26` cap).
//
// NOTE(review): assumes Resource.create(url) fetches the url and
// returns a cheerio-like handle — confirm against the Resource module.
async function collectAllPages({
  nextPageUrl,
  html,
  $,
  metaCache,
  result,
  Extractor,
  title,
  url
}) {
  // Page 1 was already parsed by the caller; the first page appended
  // here is page 2.
  let pages = 2
  // Anchor-stripped urls we have already collected, passed down so the
  // next-page scorer can skip them.
  let previousUrls = [removeAnchor(url)]
  while (nextPageUrl && pages < 26) {
    $ = await Resource.create(nextPageUrl)
    html = $.html()

    let extractorOpts = { url: nextPageUrl, html, $, metaCache }
    let nextPageResult = RootExtractor.extract(
      Extractor,
      {
        ...extractorOpts,
        url: nextPageUrl,
        // contentOnly skips re-extracting title/author/etc. and reuses
        // the title extracted from page 1.
        contentOnly: true,
        extractedTitle: title,
        previousUrls
      }
    )

    previousUrls.push(nextPageUrl)
    // Append this page's content to the accumulated result.
    result = {
      ...result,
      content: `
${result.content}
<hr>
<h4>Page ${pages}</h4>
${nextPageResult.content}
`
    }

    nextPageUrl = nextPageResult.nextPageUrl
    pages = pages + 1
  }
  return result
}
export default Iris

@ -29,5 +29,18 @@ describe('Iris', function() {
// console.log(result)
})
it('does ars pagination', async function() {
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'
const result = await Iris.parse(
url,
null,
{ fetchAllPages: true }
)
// console.log(result)
assert.equal(result.nextPageUrl, `${url}2`)
// console.log(result.content)
})
})
})

@ -301,7 +301,7 @@ export const NEGATIVE_SCORE_HINTS = [
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')
// XPath to try to determine if a page is wordpress. Not always successful.
export const IS_WP_XPATH = "//meta[@name='generator'][starts-with(@value,'WordPress')]"
export const IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]'
// Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]')

@ -19,3 +19,4 @@ export { default as extractFromSelectors } from './extract-from-selectors'
export { default as stripTags } from './strip-tags'
export { default as withinComment } from './within-comment'
export { default as nodeIsSufficient } from './node-is-sufficient'
export { default as isWordpress } from './is-wordpress'

@ -0,0 +1,5 @@
import { IS_WP_SELECTOR } from './constants'
// A page is considered WordPress-generated when at least one element
// matches the WordPress generator <meta> selector.
export default function isWordpress($) {
  const matches = $(IS_WP_SELECTOR)
  return matches.length !== 0
}

@ -0,0 +1,43 @@
import assert from 'assert'
import cheerio from 'cheerio'
import isWordpress from './is-wordpress'
describe('isWordpress($)', () => {
  it('returns false if a site is not generated by wordpress', () => {
    const html = `
<html>
<head>
<meta name="generator" value="whatever">
<head>
</html>
`
    let $ = cheerio.load(html)
    assert.equal(isWordpress($), false)

    const html2 = `
<html>
<head>
<meta name="foo" value="bar">
<head>
</html>
`
    // Fixed: this previously reloaded `html`, so the `html2` document
    // (no generator meta at all) was never actually tested.
    $ = cheerio.load(html2)
    assert.equal(isWordpress($), false)
  })

  it('returns true if a site is generated by wordpress', () => {
    const html = `
<html>
<head>
<meta name="generator" value="WordPress 4.7-alpha-38592">
<head>
</html>
`
    const $ = cheerio.load(html)
    assert.equal(isWordpress($), true)
  })
})

@ -0,0 +1 @@
export { default as range } from './range'

@ -0,0 +1,5 @@
// Generator yielding the integers from `start` through `end`,
// inclusive. Yields nothing when start > end.
export default function* range(start = 1, end = 1) {
  for (let value = start; value <= end; value += 1) {
    yield value
  }
}

@ -0,0 +1,75 @@
import URL from 'url'
import {
HAS_ALPHA_RE,
IS_ALPHA_RE,
IS_DIGIT_RE,
PAGE_IN_HREF_RE,
} from './constants'
// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
export default function articleBaseUrl(url, parsedUrl) {
  parsedUrl = parsedUrl || URL.parse(url)
  // With the legacy url.parse API, `path` includes the query string,
  // so params like "page=10" appear in the segments below.
  const { protocol, host, path } = parsedUrl

  let firstSegmentHasLetters = false
  // Work from the END of the path backwards — pagination segments
  // ("2", "page=10", "index") live at the tail of the URL.
  const cleanedSegments = path.split('/')
    .reverse()
    .reduce((acc, segment, index) => {
      // Split off and save anything that looks like a file type.
      if (segment.includes('.')) {
        const [ possibleSegment, fileExt ] = segment.split('.')
        if (IS_ALPHA_RE.test(fileExt)) {
          segment = possibleSegment
        }
      }

      // If our first or second segment has anything looking like a page
      // number, remove it.
      if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
        segment = segment.replace(PAGE_IN_HREF_RE, '')
      }

      // If we're on the first segment, check to see if we have any
      // characters in it. The first segment is actually the last bit of
      // the URL, and this will be helpful to determine if we're on a URL
      // segment that looks like "/2/" for example.
      if (index === 0) {
        firstSegmentHasLetters = HAS_ALPHA_RE.test(segment)
      }

      // If it's not marked for deletion, push it to cleaned_segments.
      if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
        acc.push(segment)
      }

      return acc
    }, [])

  // Segments accumulated in reverse order; restore original order
  // before joining back into a URL.
  return `${protocol}//${host}${cleanedSegments.reverse().join('/')}`
}
// Decide whether a path segment should be kept when reconstructing the
// article's base URL. `index` counts from the END of the path (0 is
// the last segment).
function isGoodSegment(segment, index, firstSegmentHasLetters) {
  let goodSegment = true

  // If this is purely a number, and it's the first or second
  // url_segment, it's probably a page number. Remove it.
  // NOTE(review): this branch sets `goodSegment = true`, which does
  // NOT remove the segment as the comment above claims. Short numeric
  // tail segments are only dropped by the length check below, and only
  // when the last segment has no letters. Confirm whether `false` was
  // intended before changing behavior.
  if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {
    goodSegment = true
  }

  // If this is the first url_segment and it's just "index",
  // remove it
  if (index === 0 && segment.toLowerCase() === 'index') {
    goodSegment = false
  }

  // If our first or second url_segment is smaller than 3 characters,
  // and the first url_segment had no alphas, remove it.
  if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
    goodSegment = false
  }

  return goodSegment
}

@ -0,0 +1,21 @@
import assert from 'assert'
import cheerio from 'cheerio'
import articleBaseUrl from './article-base-url'
// Pagination data should be stripped from the tail of the URL.
describe('articleBaseUrl(url, parsedUrl)', () => {
  it('returns the base url of a paginated url', () => {
    const url = "http://example.com/foo/bar/wow-cool/page=10"
    const cleaned = "http://example.com/foo/bar/wow-cool"

    assert.equal(articleBaseUrl(url), cleaned)
  })

  it('returns same url if url has no pagination info', () => {
    // The trailing slash is also normalized away.
    const url = "http://example.com/foo/bar/wow-cool/"
    const cleaned = "http://example.com/foo/bar/wow-cool"

    assert.equal(articleBaseUrl(url), cleaned)
  })
})

@ -0,0 +1,22 @@
// An expression that looks to try to find the page digit within a URL,
// if it exists.
// Matches:
//  page=1
//  pg=1
//  p=1
//  paging=12
//  pag=7
//  pagination/1
//  paging/88
//  pa/83
//  p/11
//
// Does not match:
//  page:2 (':' is not an accepted separator)
//
// NOTE(review): the digit group is {1,3}, so this regex DOES match
// three-digit values like "pg=102"; consumers (see pageNumFromUrl)
// reject page numbers >= 100 after matching. The page number itself is
// captured in group 6.
export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|\/)([0-9]{1,3})', 'i')

// At least one ASCII letter anywhere in the string.
export const HAS_ALPHA_RE = /[a-z]/i

// Entire string is ASCII letters.
export const IS_ALPHA_RE = /^[a-z]+$/i

// Entire string is digits.
export const IS_DIGIT_RE = /^[0-9]+$/i

@ -1,3 +1,6 @@
export { default as normalizeSpaces } from './normalize-spaces'
export { default as extractFromUrl } from './extract-from-url'
export { default as pageNumFromUrl } from './page-num-from-url'
export { default as removeAnchor } from './remove-anchor'
export { default as articleBaseUrl } from './article-base-url'

@ -0,0 +1,12 @@
import { PAGE_IN_HREF_RE } from './constants'
// Extract the page number from a URL's pagination segment (e.g.
// "page=3", "paging/88"). Returns null when no page number is found,
// or when the matched number is >= 100 — which usually means the match
// was not really pagination (e.g. "pg=102").
//
// Fix: parseInt is now called with an explicit radix of 10.
export default function pageNumFromUrl(url) {
  const matches = url.match(PAGE_IN_HREF_RE)
  if (!matches) return null

  // Capture group 6 of PAGE_IN_HREF_RE holds the digits.
  const pageNum = parseInt(matches[6], 10)

  // Return pageNum < 100, otherwise
  // return null
  return pageNum < 100 ? pageNum : null
}

@ -0,0 +1,45 @@
import assert from 'assert'
import pageNumFromUrl from './page-num-from-url'
// pageNumFromUrl should return the page number for recognized
// pagination forms and null otherwise (including numbers >= 100).
describe('pageNumFromUrl(url)', () => {
  it('returns null if there is no page num in the url', () => {
    const url1 = "http://example.com"
    assert.equal(pageNumFromUrl(url1), null)

    // Matches the regex but 102 >= 100, so it is rejected.
    const url2 = "http://example.com/?pg=102"
    assert.equal(pageNumFromUrl(url2), null)

    // ':' is not an accepted separator, so no match at all.
    const url3 = "http://example.com/?page:102"
    assert.equal(pageNumFromUrl(url3), null)
  })

  it('returns a page num if one matches the url', () => {
    const url1 = "http://example.com/foo?page=1"
    assert.equal(pageNumFromUrl(url1), 1)

    const url2 = "http://example.com/foo?pg=1"
    assert.equal(pageNumFromUrl(url2), 1)

    const url3 = "http://example.com/foo?p=1"
    assert.equal(pageNumFromUrl(url3), 1)

    const url4 = "http://example.com/foo?paging=1"
    assert.equal(pageNumFromUrl(url4), 1)

    const url5 = "http://example.com/foo?pag=1"
    assert.equal(pageNumFromUrl(url5), 1)

    const url6 = "http://example.com/foo?pagination/1"
    assert.equal(pageNumFromUrl(url6), 1)

    const url7 = "http://example.com/foo?paging/88"
    assert.equal(pageNumFromUrl(url7), 88)

    const url8 = "http://example.com/foo?pa/88"
    assert.equal(pageNumFromUrl(url8), 88)

    const url9 = "http://example.com/foo?p/88"
    assert.equal(pageNumFromUrl(url9), 88)
  })
})

@ -0,0 +1,3 @@
// Drop any #fragment from a url, then strip a single trailing slash.
export default function removeAnchor(url) {
  const [withoutFragment] = url.split('#')
  return withoutFragment.replace(/\/$/, '')
}

@ -0,0 +1,21 @@
import assert from 'assert'
import removeAnchor from './remove-anchor'
// removeAnchor should strip #fragments and trailing slashes.
describe('removeAnchor(url)', () => {
  it('returns a url w/out #anchor', () => {
    const url = "http://example.com/foo/bar/wow-cool/page=10/#wow"
    const cleaned = "http://example.com/foo/bar/wow-cool/page=10"

    assert.equal(removeAnchor(url), cleaned)
  })

  it('returns same url if url has no anchor found', () => {
    const url = "http://example.com/foo/bar/wow-cool"
    const cleaned = "http://example.com/foo/bar/wow-cool"

    assert.equal(removeAnchor(url), cleaned)
  })
})
Loading…
Cancel
Save