feat: nextPageUrl handles multi-page articles

Squashed commit of the following:

commit b5070c0967a7f1a0c0c449ba7ea40aebe8fe4bb8
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 13 10:03:00 2016 -0400

    root extractor includes next page url

commit 79be83127d5342d89eef33665586fabea227d6b3
Author: Adam Pash <adam.pash@gmail.com>
Date:   Tue Sep 13 09:58:20 2016 -0400

    small score adjustment

commit 0f00507dbff43401145a892e849311518edec68a
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 12 18:17:38 2016 -0400

    feat: nextPageUrl generic parser up and running

commit be91c589fc0c6d6f9b573080a76c9b1ac7af710c
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 12 11:53:58 2016 -0400

    feat: pageNumFromUrl extracts the pagenum of the current url

commit ad879d7aabedadfd051c01b42d841703bf4763fa
Author: Adam Pash <adam.pash@gmail.com>
Date:   Mon Sep 12 11:52:37 2016 -0400

    feat: isWordpress checks if a page is generated by wordpress
pull/1/head
Adam Pash 8 years ago
parent a89b9b785e
commit 7ec0ed0d31

File diff suppressed because one or more lines are too long

@ -28,6 +28,7 @@
},
"dependencies": {
"cheerio": "^0.20.0",
"difflib": "^0.2.4",
"moment": "^2.14.1",
"request-promise": "^4.1.1",
"valid-url": "^1.0.9",

@ -263,67 +263,9 @@ export const NEGATIVE_SCORE_HINTS = [
// The above list, joined into a matching regular expression
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')
// XPath to try to determine if a page is wordpress. Not always successful.
export const IS_WP_XPATH = "//meta[@name='generator'][starts-with(@value,'WordPress')]"
// Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]')
// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
export const EXTRANEOUS_LINK_HINTS = [
'print',
'archive',
'comment',
'discuss',
'e-mail',
'email',
'share',
'reply',
'all',
'login',
'sign',
'single',
'adx',
'entry-unrelated'
]
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')
// An expression that looks to try to find the page digit within a URL, if
// it exists.
// Matches:
// page=1
// pg=1
// p=1
// paging=12
// pag=7
// pagination/1
// paging/88
// pa/83
// p/11
//
// Does not match:
// pg=102
// page:2
// DISABLING FOR NOW TODO AP
// export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)?(=|\/)(?P<pagenum>[0-9]{1,2})))', 'i')
// Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i')
// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i')
// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i')
// Match any link text/classname/id that looks like it means the previous
// page.
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i')
// Match 2 or more consecutive <br> tags
export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i')

@ -6,6 +6,7 @@ import GenericAuthorExtractor from './author/extractor'
import GenericDatePublishedExtractor from './date-published/extractor'
import GenericDekExtractor from './dek/extractor'
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
import GenericNextPageUrlExtractor from './next-page-url/extractor'
const GenericExtractor = {
// This extractor is the default for all domains
@ -16,6 +17,7 @@ const GenericExtractor = {
content: GenericContentExtractor.extract.bind(GenericContentExtractor),
leadImageUrl: GenericLeadImageUrlExtractor.extract,
dek: GenericDekExtractor.extract,
nextPageUrl: GenericNextPageUrlExtractor.extract,
extract: function(options) {
let { html } = options
@ -31,6 +33,7 @@ const GenericExtractor = {
const content = this.content({ ...options, title })
const leadImageUrl = this.leadImageUrl(options)
const dek = this.dek(options)
const nextPageUrl = this.nextPageUrl(options)
return {
title,
@ -39,6 +42,7 @@ const GenericExtractor = {
dek,
leadImageUrl,
content,
nextPageUrl,
}
}
}

@ -0,0 +1,54 @@
import 'babel-polyfill'
import URL from 'url'
import {
pageNumFromUrl,
articleBaseUrl,
removeAnchor,
} from 'utils/text'
import scoreLinks from './scoring/score-links'
// Looks for and returns next page url
// for multi-page articles
// Generic extractor that looks for and returns the next page url
// for multi-page articles.
//
// Every anchor on the page is scored as a possible "next page" link;
// the highest-scoring href is returned only when its score clears the
// confidence threshold, otherwise null.
//
// Fix: removed dead locals (`currentPageNum`, `host`) that were
// computed but never used.
const GenericNextPageUrlExtractor = {
  extract({ $, url, parsedUrl, previousUrls=[] }) {
    parsedUrl = parsedUrl || URL.parse(url)

    // Anchor-free url and pagination-free base url, used by the
    // scorers for exact-match and similarity comparisons.
    const articleUrl = removeAnchor(url)
    const baseUrl = articleBaseUrl(url, parsedUrl)

    const links = $('a[href]').toArray()

    const scoredLinks = scoreLinks({
      links,
      articleUrl,
      baseUrl,
      parsedUrl,
      $,
      previousUrls
    })

    // If no links were scored, return null
    if (!scoredLinks) return null

    // now that we've scored all possible pages,
    // find the biggest one.
    const topPage = Reflect.ownKeys(scoredLinks).reduce((acc, link) => {
      const scoredLink = scoredLinks[link]
      return scoredLink.score > acc.score ? scoredLink : acc
    }, { score: -100 })

    // If the score is less than 50, we're not confident enough to use it,
    // so we fail.
    if (topPage.score >= 50) {
      return topPage.href
    } else {
      return null
    }
  }
}

export default GenericNextPageUrlExtractor

@ -0,0 +1,34 @@
import assert from 'assert'
import fs from 'fs'
import cheerio from 'cheerio'
import GenericNextPageUrlExtractor from './extractor'
// Integration tests for the generic next-page-url extractor, run under
// mocha against a saved Ars Technica fixture page.
describe('GenericNextPageUrlExtractor', () => {
  it('returns most likely next page url', () => {
    const html = fs.readFileSync('./fixtures/ars.html', 'utf8')
    const $ = cheerio.load(html)
    const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'
    // The fixture's page-2 link is expected to win the scoring.
    const next = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/2'

    const nextPage = GenericNextPageUrlExtractor.extract({
      $,
      url
    })

    assert.equal(nextPage, next)
  })

  it('returns null if there is no likely next page', () => {
    // A document with no anchors at all cannot produce a candidate.
    const html = `<div><p>HI</p></div>`
    const $ = cheerio.load(html)
    const url = 'http://example.com/foo/bar'

    const nextPage = GenericNextPageUrlExtractor.extract({
      $,
      url
    })

    assert.equal(nextPage, null)
  })
})

@ -0,0 +1,38 @@
// Matches any single digit.
export const DIGIT_RE = /\d/

// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
export const EXTRANEOUS_LINK_HINTS = [
  'print',
  'archive',
  'comment',
  'discuss',
  'e-mail',
  'email',
  'share',
  'reply',
  'all',
  'login',
  'sign',
  'single',
  'adx',
  'entry-unrelated'
]

// The hints above, joined into one case-insensitive alternation.
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')

// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
export const NEXT_LINK_TEXT_RE = /(next|weiter|continue|>([^|]|$)|»([^|]|$))/i

// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = /(first|last|end)/i

// Match any link text/classname/id that looks like it means the previous
// page.
export const PREV_LINK_TEXT_RE = /(prev|earl|old|new|<|«)/i

// Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = /pag(e|ing|inat)/i

@ -0,0 +1,301 @@
import 'babel-polyfill'
import URL from 'url'
import difflib from 'difflib'
import { range } from 'utils'
import { isWordpress } from 'utils/dom'
import {
removeAnchor,
pageNumFromUrl,
} from 'utils/text'
import {
DIGIT_RE,
NEXT_LINK_TEXT_RE,
PREV_LINK_TEXT_RE,
EXTRANEOUS_LINK_HINTS_RE,
CAP_LINK_TEXT_RE,
PAGE_RE,
} from './constants'
import {
NEGATIVE_SCORE_RE,
POSITIVE_SCORE_RE,
} from 'utils/dom/constants'
import { IS_DIGIT_RE } from 'utils/text/constants'
// Score every candidate anchor on the page as a potential next-page
// link. Returns a map of href -> { score, linkText, href }, or null
// when no link survived the first-pass filters.
export default function scoreLinks({
  links,
  articleUrl,
  baseUrl,
  parsedUrl,
  $,
  previousUrls=[]
}) {
  parsedUrl = parsedUrl || URL.parse(articleUrl)
  const baseRegex = makeBaseRegex(baseUrl)
  const isWp = isWordpress($)

  // Walk all links, looking for hints that they may be next-page
  // links — "page" in their text/class/id, a page-y parent node,
  // etc. — then give each surviving candidate a composite score.
  const candidates = {}
  for (const link of links) {
    // Strip anchor data up front; URL standardization is hard, and
    // comparisons elsewhere are done with and without trailing parts.
    const href = removeAnchor(link.attribs.href)
    const $link = $(link)
    const linkText = $link.text()

    // First-pass disqualifiers: already fetched, wrong host,
    // extraneous text, no digits, etc.
    if (!shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)) {
      continue
    }

    // Duplicate hrefs share one entry; their link texts are merged so
    // every occurrence feeds the signature-based scorers.
    if (!candidates[href]) {
      candidates[href] = {
        score: 0,
        linkText,
        href,
      }
    } else {
      candidates[href].linkText = `${candidates[href].linkText}|${linkText}`
    }

    const candidate = candidates[href]
    const linkData = makeSig($link, linkText)
    const pageNum = pageNumFromUrl(href)

    let score = scoreBaseUrl(href, baseRegex)
    score += scoreNextLinkText(linkData)
    score += scoreCapLinks(linkData)
    score += scorePrevLink(linkData)
    score += scoreByParents($link)
    score += scoreExtraneousLinks(href)
    score += scorePageInLink(pageNum, isWp)
    score += scoreLinkText(linkText, pageNum)
    // Similarity runs last: it reads the score accumulated so far and
    // only fires for links that are already positive candidates.
    score += scoreSimilarity(score, articleUrl, href)

    candidate.score = score
  }

  return Reflect.ownKeys(candidates).length === 0 ? null : candidates
}
// Build a case-insensitive regex that matches strings beginning with
// baseUrl.
//
// Fix: baseUrl was interpolated into the pattern unescaped, so URL
// characters like '.', '?' and '+' acted as regex metacharacters
// (e.g. 'example.com' also matched 'exampleXcom'). Escape them so the
// base url is matched literally.
export function makeBaseRegex(baseUrl) {
  const escaped = baseUrl.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
  return new RegExp(`^${escaped}`, 'i')
}
// Add a similarity-based modifier to an already-positive score.
//
// Runs last and only for positive candidates because the difflib
// comparison is potentially expensive. The modifier pivots around a
// 20% difference between the article url and the candidate href:
// less-different urls earn a bonus, more-different urls a penalty,
// scaled by 250. Ex: 12% different = +20, 20% = 0, 30% = -25.
//
// Note: the returned value INCLUDES the incoming score (callers add
// it on top, effectively doubling the base score for candidates).
export function scoreSimilarity(score, articleUrl, href) {
  if (score <= 0) return 0

  const similarity = new difflib.SequenceMatcher(null, articleUrl, href).ratio()
  const diffPercent = 1.0 - similarity
  const diffModifier = -(250 * (diffPercent - 0.2))
  return score + diffModifier
}
// Score a link purely by its text when that text is numeric.
//
// If the link text can be parsed as a number, give it a minor bonus
// with a slight bias towards lower-numbered pages, so numeric
// pagination links ("2", "3", ...) sort sensibly even without "next"
// in their text.
//
// Fix: parseInt is now called with an explicit radix of 10.
export function scoreLinkText(linkText, pageNum) {
  let score = 0

  if (IS_DIGIT_RE.test(linkText.trim())) {
    const linkTextAsNum = parseInt(linkText, 10)
    // If it's the first page, we already got it on the first call.
    // Give it a negative score. Otherwise, up to page 10, give a
    // small bonus.
    if (linkTextAsNum < 2) {
      score = -30
    } else {
      score = Math.max(0, 10 - linkTextAsNum)
    }

    // If the current page number is greater than or equal to this
    // link's number, the link almost certainly points backwards.
    // Give it a big penalty.
    if (pageNum && pageNum >= linkTextAsNum) {
      score = score - 50
    }
  }

  return score
}
// Bonus for a detectable page number in the href — except on
// WordPress sites, whose "?p=123" urls identify entirely separate
// posts rather than pages of the same article.
export function scorePageInLink(pageNum, isWp) {
  return pageNum && !isWp ? 50 : 0
}
// Penalize hrefs containing extraneous hints ("comment", "share",
// "print", ...) that mark a link as unlikely to be a next page.
export function scoreExtraneousLinks(href) {
  return EXTRANEOUS_LINK_HINTS_RE.test(href) ? -25 : 0
}
// Walk up to five ancestors of $link, adjusting the score based on
// each ancestor's text/class/id signature.
export function scoreByParents($link) {
  // If a parent node contains paging-like classname or id, give a
  // bonus. Additionally, if a parent_node contains bad content
  // (like 'sponsor'), give a penalty.
  let $parent = $link.parent()
  let positiveMatch = false
  let negativeMatch = false
  let score = 0

  // range(0, 4) yields 0..4, so at most five ancestor levels are
  // examined; the bonus and the penalty are each applied at most once.
  Array.from(range(0, 4)).forEach((_) => {
    if ($parent.length === 0) {
      return
    }

    const parentData = makeSig($parent, ' ')

    // If we have 'page' or 'paging' in our data, that's a good
    // sign. Add a bonus.
    if (!positiveMatch && PAGE_RE.test(parentData)) {
      positiveMatch = true
      score = score + 25
    }

    // If we have 'comment' or something in our data, and
    // we don't have something like 'content' as well, that's
    // a bad sign. Give a penalty.
    // NOTE(review): the penalty requires BOTH a negative hint and an
    // extraneous-link hint in the signature — confirm that both-ands
    // (rather than either alone) is intended.
    if (!negativeMatch && NEGATIVE_SCORE_RE.test(parentData)
      && EXTRANEOUS_LINK_HINTS_RE.test(parentData)) {
      if (!POSITIVE_SCORE_RE.test(parentData)) {
        negativeMatch = true
        score = score - 25
      }
    }

    $parent = $parent.parent()
  })

  return score
}
// Links labeled "previous", "older", etc. point backwards; disqualify
// them outright with a heavy penalty.
export function scorePrevLink(linkData) {
  return PREV_LINK_TEXT_RE.test(linkData) ? -200 : 0
}
// "Cap" links are links like "first", "last", "end", etc.
// A signature that reads as both a cap link AND a next link (e.g.
// "next | last") is penalized; cap text alone scores 0.
export function scoreCapLinks(linkData) {
  if (CAP_LINK_TEXT_RE.test(linkData) && NEXT_LINK_TEXT_RE.test(linkData)) {
    return -65
  }
  return 0
}
// Strong bonus for text that commonly labels a next-page link:
// "next", "continue", ">>", etc.
export function scoreNextLinkText(linkData) {
  return NEXT_LINK_TEXT_RE.test(linkData) ? 50 : 0
}
// Links that don't share the article's base URL are less likely to be
// its next page — they can still win, just with lower odds.
// Example:
// http://www.actionscript.org/resources/articles/745/1/JavaScript-and-VBScript-Injection-in-ActionScript-3/Page1.html
export function scoreBaseUrl(href, baseRegex) {
  return baseRegex.test(href) ? 0 : -25
}
// First-pass filter: decide whether a link is even worth scoring as a
// next-page candidate.
//
// Fix: replaced `previousUrls.find((url) => href === url) !== undefined`
// with the equivalent, idiomatic `previousUrls.includes(href)`.
export function shouldScore(
  href,
  articleUrl,
  baseUrl,
  parsedUrl,
  linkText,
  previousUrls
) {
  // skip if we've already fetched this url
  if (previousUrls.includes(href)) {
    return false
  }

  // If the URL is empty, or matches the article or base
  // URL exactly, skip it.
  if (!href || href === articleUrl || href === baseUrl) {
    return false
  }

  // Domain mismatch — next pages live on the same host.
  const { hostname } = parsedUrl
  const { hostname: linkHost } = URL.parse(href)
  if (linkHost !== hostname) {
    return false
  }

  // If href doesn't contain a digit after removing the base URL,
  // it's certainly not the next page.
  const fragment = href.replace(baseUrl, '')
  if (!DIGIT_RE.test(fragment)) {
    return false
  }

  // This link has extraneous content (like "comment") in its link
  // text, so we skip it.
  if (EXTRANEOUS_LINK_HINTS_RE.test(linkText)) {
    return false
  }

  // Next page link text is never long, skip if it is too long.
  if (linkText.length > 25) {
    return false
  }

  return true
}
// Build a scoring "signature" for a node: its text (or the provided
// override), class, and id, space-separated.
function makeSig($link, linkText) {
  const text = linkText || $link.text()
  const className = $link.attr('class') || ''
  const id = $link.attr('id') || ''
  return `${text} ${className} ${id}`
}

@ -0,0 +1,239 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import URL from 'url'
import scoreLinks from './score-links'
import {
makeBaseRegex,
scoreBaseUrl,
scoreNextLinkText,
scoreCapLinks,
scorePrevLink,
scoreByParents,
scoreExtraneousLinks,
scorePageInLink,
scoreLinkText,
scoreSimilarity,
shouldScore,
} from './score-links'
// Unit tests for the next-page link scorer, run under mocha against
// fixture HTML loaded with cheerio.
describe('scoreLinks(links)', () => {
  it('returns an object of scored links', () => {
    const html = fs.readFileSync('./fixtures/ars.html', 'utf8')
    const $ = cheerio.load(html)
    const links = $('a[href]').toArray()
    const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'

    const scoredPages = scoreLinks({
      links,
      articleUrl: url,
      baseUrl: 'http://arstechnica.com',
      $,
    })

    assert.equal(typeof scoredPages, 'object')
  })

  it('returns null if no possible pages', () => {
    // No anchors at all — every candidate is filtered out.
    const html = `<div><p>Hello wow</p></div>`
    const $ = cheerio.load(html)
    const links = $('a[href]').toArray()
    const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'

    const scoredPages = scoreLinks({
      links,
      articleUrl: url,
      baseUrl: 'http://arstechnica.com',
      $,
    })

    assert.equal(scoredPages, null)
  })
})

describe('scoreBaseUrl(href, baseRegex)', () => {
  it('returns -25 if url does not contain the base url', () => {
    const baseUrl = 'http://example.com/foo/bar'
    const badUrl = 'http://foo.com/foo/bar'
    const baseRegex = makeBaseRegex(baseUrl)

    assert.equal(scoreBaseUrl(badUrl, baseRegex), -25)
  })

  it('returns 0 if url contains the base url', () => {
    const baseUrl = 'http://example.com/foo/bar'
    // NOTE(review): despite its name, this url DOES start with the
    // base url and therefore scores 0; `goodUrl` would be clearer.
    const badUrl = 'http://example.com/foo/bar/bat'
    const baseRegex = makeBaseRegex(baseUrl)

    assert.equal(scoreBaseUrl(badUrl, baseRegex), 0)
  })
})

describe('scoreNextLinkText(linkData)', () => {
  it('returns 50 if contains common next link text', () => {
    const linkData = "foo bar Next page"

    assert.equal(scoreNextLinkText(linkData), 50)
  })

  it('returns 0 if does not contain common next link text', () => {
    const linkData = "foo bar WOW GREAT"

    assert.equal(scoreNextLinkText(linkData), 0)
  })
})

describe('scoreCapLinks(linkData)', () => {
  it('returns -65 if cap link with next link text', () => {
    // Contains both a cap hint ("Last") and a next hint ("next").
    const linkData = "foo next Last page"

    assert.equal(scoreCapLinks(linkData), -65)
  })

  it('returns 0 if does not match a cap link', () => {
    const linkData = "foo bar WOW GREAT"

    assert.equal(scoreCapLinks(linkData), 0)
  })
})
describe('scorePrevLink(linkData)', () => {
  it('returns -200 if link matches previous text', () => {
    const linkData = "foo next previous page"

    assert.equal(scorePrevLink(linkData), -200)
  })

  it('returns 0 if does not match a prev link', () => {
    const linkData = "foo bar WOW GREAT"

    // Fixed: this assertion previously called scoreCapLinks (a
    // copy-paste slip), leaving scorePrevLink's zero case untested.
    assert.equal(scorePrevLink(linkData), 0)
  })
})
// Ancestor-signature scoring: a page-like parent adds 25, a
// comment-like parent subtracts 25.
describe('scoreByParents($link)', () => {
  it('returns 25 if parent sig looks like a page', () => {
    const html = `
<div>
<div class="next-page">
<a href="blah">Next page</a>
</div>
</div>
`
    const $ = cheerio.load(html)
    const $link = $('a').first()

    assert.equal(scoreByParents($link), 25)
  })

  it('returns -25 if parent sig looks like a comment', () => {
    const html = `
<div>
<div class="comment">
<a href="blah">Next page</a>
</div>
</div>
`
    const $ = cheerio.load(html)
    const $link = $('a').first()

    assert.equal(scoreByParents($link), -25)
  })
})

describe('scoreExtraneousLinks(href)', () => {
  it('returns -25 if link matches extraneous text', () => {
    const url = "http://example.com/email-link"

    assert.equal(scoreExtraneousLinks(url), -25)
  })

  it('returns 0 if does not match extraneous text', () => {
    const url = "http://example.com/asdf"

    assert.equal(scoreExtraneousLinks(url), 0)
  })
})

describe('scorePageInLink(pageNum, isWp)', () => {
  it('returns 50 if link contains a page num', () => {
    assert.equal(scorePageInLink(1, false), 50)
  })

  it('returns 0 if link contains no page num', () => {
    assert.equal(scorePageInLink(null, false), 0)
  })

  it('returns 0 if page is wordpress', () => {
    // WordPress "?p=123" urls are separate posts, not pages.
    assert.equal(scorePageInLink(10, true), 0)
  })
})

// NOTE(review): scoreLinkText also takes a second `pageNum` argument,
// exercised by the last case below.
describe('scoreLinkText(linkText)', () => {
  it('returns 8 if link contains the num 2', () => {
    assert.equal(scoreLinkText('2', 0), 8)
  })

  it('returns 5 if link contains the num 5', () => {
    assert.equal(scoreLinkText('5', 0), 5)
  })

  it('returns -30 if link contains the number 1', () => {
    assert.equal(scoreLinkText('1', 0), -30)
  })

  it('penalizes -50 if pageNum is >= link text as num', () => {
    // Base score for '4' is 6; the backwards-link penalty makes -44.
    assert.equal(scoreLinkText('4', 5), -44)
  })
})

describe('scoreSimilarity(score, articleUrl, href)', () => {
  it('returns a similarity bonus based on current score', () => {
    const articleUrl = 'http://example.com/foo/bar'
    const href = 'http://example.com/foo/bar/2'
    const score = 25

    assert.equal(
      Math.round(scoreSimilarity(score, articleUrl, href)),
      66
    )
  })

  it('returns 0 is current score <= 0', () => {
    const articleUrl = 'http://example.com/foo/bar'
    const href = 'http://example.com/foo/bar/2'
    const score = 0

    assert.equal(scoreSimilarity(score, articleUrl, href), 0)
  })
})

describe('shouldScore(href, articleUrl, baseUrl, parsedUrl, linkText, previousUrls)', () => {
  it('returns false if href has already been fetched', () => {
    const previousUrls = [ 'http://example.com/foo/bar/2' ]
    const href = 'http://example.com/foo/bar/2'
    const parsedUrl = URL.parse(href)

    assert.equal(
      shouldScore(href, '', '', parsedUrl, '', previousUrls),
      false
    )
  })

  it('returns true if href has not been fetched', () => {
    const previousUrls = [ 'http://example.com/foo/bar' ]
    const href = 'http://example.com/foo/bar/2'
    const parsedUrl = URL.parse(href)

    assert.equal(
      shouldScore(href, '', '', parsedUrl, '', previousUrls),
      true
    )
  })
})

@ -7,7 +7,7 @@ import { ATTR_RE } from './constants'
const RootExtractor = {
extract(extractor=GenericExtractor, opts) {
const { $ } = opts
const { $, contentOnly, extractedTitle } = opts
// This is the generic extractor. Run its extract method
if (extractor.domain === '*') return extractor.extract(opts)
@ -16,23 +16,33 @@ const RootExtractor = {
extractor
}
const title = extract({ ...opts, type: 'title' })
const datePublished = extract({ ...opts, type: 'datePublished' })
const author = extract({ ...opts, type: 'author' })
const content = extract({
...opts, type: 'content', extractHtml: true, title
})
const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', content })
const dek = extract({ ...opts, type: 'dek', content })
return {
title,
content,
author,
datePublished,
leadImageUrl,
dek,
if (contentOnly) {
const content = extract({
...opts, type: 'content', extractHtml: true, title: extractedTitle
})
return {
content
}
} else {
const title = extract({ ...opts, type: 'title' })
const datePublished = extract({ ...opts, type: 'datePublished' })
const author = extract({ ...opts, type: 'author' })
const nextPageUrl = extract({ ...opts, type: 'nextPageUrl' })
const content = extract({
...opts, type: 'content', extractHtml: true, title
})
const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', content })
const dek = extract({ ...opts, type: 'dek', content })
return {
title,
content,
author,
datePublished,
leadImageUrl,
dek,
}
}
}
}

@ -3,10 +3,12 @@ import fs from 'fs'
import Resource from 'resource'
import getExtractor from 'extractors/get-extractor'
import RootExtractor from 'extractors/root-extractor'
import { removeAnchor } from 'utils/text'
const Iris = {
parse: async function(url, html) {
const $ = await Resource.create(url, html)
parse: async function(url, html, opts={}) {
const { fetchAllPages=true } = opts || true
let $ = await Resource.create(url, html)
html = $.html()
const Extractor = getExtractor(url)
@ -18,9 +20,61 @@ const Iris = {
return $(node).attr('name')
}).toArray()
const result = RootExtractor.extract(Extractor, { url, html, $, metaCache })
let extractorOpts = { url, html, $, metaCache }
let result = RootExtractor.extract(Extractor, extractorOpts)
let { nextPageUrl, title } = result
if (fetchAllPages && nextPageUrl) {
result = await collectAllPages({ nextPageUrl, html, $, metaCache, result, Extractor, title, url })
}
return result
}
}
// Fetch and append every subsequent page of a multi-page article.
//
// Starting from `nextPageUrl`, repeatedly fetches the next page,
// extracts its content only (reusing the already-extracted title),
// and appends it to `result.content` behind an <hr> and a per-page
// heading. Stops when no further next-page url is found or after 25
// additional pages (the `pages < 26` cap).
//
// NOTE(review): assumes Resource.create(url) fetches the url and
// returns a cheerio-like handle — confirm against the Resource module.
async function collectAllPages({
  nextPageUrl,
  html,
  $,
  metaCache,
  result,
  Extractor,
  title,
  url
}) {
  // Page 1 was already parsed by the caller; the first page appended
  // here is page 2.
  let pages = 2
  // Anchor-stripped urls we have already collected, passed down so the
  // next-page scorer can skip them.
  let previousUrls = [removeAnchor(url)]
  while (nextPageUrl && pages < 26) {
    $ = await Resource.create(nextPageUrl)
    html = $.html()

    let extractorOpts = { url: nextPageUrl, html, $, metaCache }
    let nextPageResult = RootExtractor.extract(
      Extractor,
      {
        ...extractorOpts,
        url: nextPageUrl,
        // contentOnly skips re-extracting title/author/etc. and reuses
        // the title extracted from page 1.
        contentOnly: true,
        extractedTitle: title,
        previousUrls
      }
    )

    previousUrls.push(nextPageUrl)
    // Append this page's content to the accumulated result.
    result = {
      ...result,
      content: `
${result.content}
<hr>
<h4>Page ${pages}</h4>
${nextPageResult.content}
`
    }

    nextPageUrl = nextPageResult.nextPageUrl
    pages = pages + 1
  }
  return result
}
export default Iris

@ -29,5 +29,18 @@ describe('Iris', function() {
// console.log(result)
})
it('does ars pagination', async function() {
const url = 'http://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'
const result = await Iris.parse(
url,
null,
{ fetchAllPages: true }
)
// console.log(result)
assert.equal(result.nextPageUrl, `${url}2`)
// console.log(result.content)
})
})
})

@ -301,7 +301,7 @@ export const NEGATIVE_SCORE_HINTS = [
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')
// XPath to try to determine if a page is wordpress. Not always successful.
export const IS_WP_XPATH = "//meta[@name='generator'][starts-with(@value,'WordPress')]"
export const IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]'
// Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]')

@ -19,3 +19,4 @@ export { default as extractFromSelectors } from './extract-from-selectors'
export { default as stripTags } from './strip-tags'
export { default as withinComment } from './within-comment'
export { default as nodeIsSufficient } from './node-is-sufficient'
export { default as isWordpress } from './is-wordpress'

@ -0,0 +1,5 @@
import { IS_WP_SELECTOR } from './constants'
// A page is considered WordPress-generated when at least one element
// matches the WordPress generator <meta> selector.
export default function isWordpress($) {
  const matches = $(IS_WP_SELECTOR)
  return matches.length !== 0
}

@ -0,0 +1,43 @@
import assert from 'assert'
import cheerio from 'cheerio'
import isWordpress from './is-wordpress'
describe('isWordpress($)', () => {
  it('returns false if a site is not generated by wordpress', () => {
    const html = `
<html>
<head>
<meta name="generator" value="whatever">
<head>
</html>
`
    let $ = cheerio.load(html)
    assert.equal(isWordpress($), false)

    const html2 = `
<html>
<head>
<meta name="foo" value="bar">
<head>
</html>
`
    // Fixed: this previously reloaded `html`, so the `html2` document
    // (no generator meta at all) was never actually tested.
    $ = cheerio.load(html2)
    assert.equal(isWordpress($), false)
  })

  it('returns true if a site is generated by wordpress', () => {
    const html = `
<html>
<head>
<meta name="generator" value="WordPress 4.7-alpha-38592">
<head>
</html>
`
    const $ = cheerio.load(html)
    assert.equal(isWordpress($), true)
  })
})

@ -0,0 +1 @@
export { default as range } from './range'

@ -0,0 +1,5 @@
// Generator yielding the integers from `start` through `end`,
// inclusive. Yields nothing when start > end.
export default function* range(start = 1, end = 1) {
  for (let value = start; value <= end; value += 1) {
    yield value
  }
}

@ -0,0 +1,75 @@
import URL from 'url'
import {
HAS_ALPHA_RE,
IS_ALPHA_RE,
IS_DIGIT_RE,
PAGE_IN_HREF_RE,
} from './constants'
// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
// Take a URL, and return the article base of said URL. That is, no
// pagination data exists in it. Useful for comparing to other links
// that might have pagination data within them.
export default function articleBaseUrl(url, parsedUrl) {
  parsedUrl = parsedUrl || URL.parse(url)
  // With the legacy url.parse API, `path` includes the query string,
  // so params like "page=10" appear in the segments below.
  const { protocol, host, path } = parsedUrl

  let firstSegmentHasLetters = false
  // Work from the END of the path backwards — pagination segments
  // ("2", "page=10", "index") live at the tail of the URL.
  const cleanedSegments = path.split('/')
    .reverse()
    .reduce((acc, segment, index) => {
      // Split off and save anything that looks like a file type.
      if (segment.includes('.')) {
        const [ possibleSegment, fileExt ] = segment.split('.')
        if (IS_ALPHA_RE.test(fileExt)) {
          segment = possibleSegment
        }
      }

      // If our first or second segment has anything looking like a page
      // number, remove it.
      if (PAGE_IN_HREF_RE.test(segment) && index < 2) {
        segment = segment.replace(PAGE_IN_HREF_RE, '')
      }

      // If we're on the first segment, check to see if we have any
      // characters in it. The first segment is actually the last bit of
      // the URL, and this will be helpful to determine if we're on a URL
      // segment that looks like "/2/" for example.
      if (index === 0) {
        firstSegmentHasLetters = HAS_ALPHA_RE.test(segment)
      }

      // If it's not marked for deletion, push it to cleaned_segments.
      if (isGoodSegment(segment, index, firstSegmentHasLetters)) {
        acc.push(segment)
      }

      return acc
    }, [])

  // Segments accumulated in reverse order; restore original order
  // before joining back into a URL.
  return `${protocol}//${host}${cleanedSegments.reverse().join('/')}`
}
// Decide whether a path segment should be kept when reconstructing the
// article's base URL. `index` counts from the END of the path (0 is
// the last segment).
function isGoodSegment(segment, index, firstSegmentHasLetters) {
  let goodSegment = true

  // If this is purely a number, and it's the first or second
  // url_segment, it's probably a page number. Remove it.
  // NOTE(review): this branch sets `goodSegment = true`, which does
  // NOT remove the segment as the comment above claims. Short numeric
  // tail segments are only dropped by the length check below, and only
  // when the last segment has no letters. Confirm whether `false` was
  // intended before changing behavior.
  if (index < 2 && IS_DIGIT_RE.test(segment) && segment.length < 3) {
    goodSegment = true
  }

  // If this is the first url_segment and it's just "index",
  // remove it
  if (index === 0 && segment.toLowerCase() === 'index') {
    goodSegment = false
  }

  // If our first or second url_segment is smaller than 3 characters,
  // and the first url_segment had no alphas, remove it.
  if (index < 2 && segment.length < 3 && !firstSegmentHasLetters) {
    goodSegment = false
  }

  return goodSegment
}

@ -0,0 +1,21 @@
import assert from 'assert'
import cheerio from 'cheerio'
import articleBaseUrl from './article-base-url'
// Pagination data should be stripped from the tail of the URL.
describe('articleBaseUrl(url, parsedUrl)', () => {
  it('returns the base url of a paginated url', () => {
    const url = "http://example.com/foo/bar/wow-cool/page=10"
    const cleaned = "http://example.com/foo/bar/wow-cool"

    assert.equal(articleBaseUrl(url), cleaned)
  })

  it('returns same url if url has no pagination info', () => {
    // The trailing slash is also normalized away.
    const url = "http://example.com/foo/bar/wow-cool/"
    const cleaned = "http://example.com/foo/bar/wow-cool"

    assert.equal(articleBaseUrl(url), cleaned)
  })
})

@ -0,0 +1,22 @@
// An expression that looks to try to find the page digit within a URL,
// if it exists.
// Matches:
//  page=1
//  pg=1
//  p=1
//  paging=12
//  pag=7
//  pagination/1
//  paging/88
//  pa/83
//  p/11
//
// Does not match:
//  page:2 (':' is not an accepted separator)
//
// NOTE(review): the digit group is {1,3}, so this regex DOES match
// three-digit values like "pg=102"; consumers (see pageNumFromUrl)
// reject page numbers >= 100 after matching. The page number itself is
// captured in group 6.
export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)))?(=|\/)([0-9]{1,3})', 'i')

// At least one ASCII letter anywhere in the string.
export const HAS_ALPHA_RE = /[a-z]/i

// Entire string is ASCII letters.
export const IS_ALPHA_RE = /^[a-z]+$/i

// Entire string is digits.
export const IS_DIGIT_RE = /^[0-9]+$/i

@ -1,3 +1,6 @@
export { default as normalizeSpaces } from './normalize-spaces'
export { default as extractFromUrl } from './extract-from-url'
export { default as pageNumFromUrl } from './page-num-from-url'
export { default as removeAnchor } from './remove-anchor'
export { default as articleBaseUrl } from './article-base-url'

@ -0,0 +1,12 @@
import { PAGE_IN_HREF_RE } from './constants'
// Extract the page number from a URL's pagination segment (e.g.
// "page=3", "paging/88"). Returns null when no page number is found,
// or when the matched number is >= 100 — which usually means the match
// was not really pagination (e.g. "pg=102").
//
// Fix: parseInt is now called with an explicit radix of 10.
export default function pageNumFromUrl(url) {
  const matches = url.match(PAGE_IN_HREF_RE)
  if (!matches) return null

  // Capture group 6 of PAGE_IN_HREF_RE holds the digits.
  const pageNum = parseInt(matches[6], 10)

  // Return pageNum < 100, otherwise
  // return null
  return pageNum < 100 ? pageNum : null
}

@ -0,0 +1,45 @@
import assert from 'assert'
import pageNumFromUrl from './page-num-from-url'
// pageNumFromUrl should return the page number for recognized
// pagination forms and null otherwise (including numbers >= 100).
describe('pageNumFromUrl(url)', () => {
  it('returns null if there is no page num in the url', () => {
    const url1 = "http://example.com"
    assert.equal(pageNumFromUrl(url1), null)

    // Matches the regex but 102 >= 100, so it is rejected.
    const url2 = "http://example.com/?pg=102"
    assert.equal(pageNumFromUrl(url2), null)

    // ':' is not an accepted separator, so no match at all.
    const url3 = "http://example.com/?page:102"
    assert.equal(pageNumFromUrl(url3), null)
  })

  it('returns a page num if one matches the url', () => {
    const url1 = "http://example.com/foo?page=1"
    assert.equal(pageNumFromUrl(url1), 1)

    const url2 = "http://example.com/foo?pg=1"
    assert.equal(pageNumFromUrl(url2), 1)

    const url3 = "http://example.com/foo?p=1"
    assert.equal(pageNumFromUrl(url3), 1)

    const url4 = "http://example.com/foo?paging=1"
    assert.equal(pageNumFromUrl(url4), 1)

    const url5 = "http://example.com/foo?pag=1"
    assert.equal(pageNumFromUrl(url5), 1)

    const url6 = "http://example.com/foo?pagination/1"
    assert.equal(pageNumFromUrl(url6), 1)

    const url7 = "http://example.com/foo?paging/88"
    assert.equal(pageNumFromUrl(url7), 88)

    const url8 = "http://example.com/foo?pa/88"
    assert.equal(pageNumFromUrl(url8), 88)

    const url9 = "http://example.com/foo?p/88"
    assert.equal(pageNumFromUrl(url9), 88)
  })
})

@ -0,0 +1,3 @@
// Drop any #fragment from a url, then strip a single trailing slash.
export default function removeAnchor(url) {
  const [withoutFragment] = url.split('#')
  return withoutFragment.replace(/\/$/, '')
}

@ -0,0 +1,21 @@
import assert from 'assert'
import removeAnchor from './remove-anchor'
// removeAnchor should strip #fragments and trailing slashes.
describe('removeAnchor(url)', () => {
  it('returns a url w/out #anchor', () => {
    const url = "http://example.com/foo/bar/wow-cool/page=10/#wow"
    const cleaned = "http://example.com/foo/bar/wow-cool/page=10"

    assert.equal(removeAnchor(url), cleaned)
  })

  it('returns same url if url has no anchor found', () => {
    const url = "http://example.com/foo/bar/wow-cool"
    const cleaned = "http://example.com/foo/bar/wow-cool"

    assert.equal(removeAnchor(url), cleaned)
  })
})
Loading…
Cancel
Save