@@ -28,3 +28,411 @@ export const CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div'].join(',')

// cleanHeaders
const HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6']
export const HEADER_TAG_LIST = HEADER_TAGS.join(',')
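
/**
 * Illustrative usage sketch (assumption: a cheerio-style `$` wrapper; the
 * helper is hypothetical, not part of this module): HEADER_TAG_LIST is a CSS
 * selector list, so every header can be visited in one pass, e.g. to drop
 * headers that just repeat the article title.
 *
 * @example
 *   function removeTitleMatchingHeaders($, title) {
 *     $(HEADER_TAG_LIST).each((_, node) => {
 *       if ($(node).text().trim() === title) $(node).remove()
 *     })
 *   }
 */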

//// CONTENT FETCHING CONSTANTS ////

// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence with RegExp#test, so they may contain
// simple, non-pipe-style regular expression fragments if necessary.
export const UNLIKELY_CANDIDATES_BLACKLIST = [
  'ad-break',
  'adbox',
  'advert',
  'addthis',
  'agegate',
  'aux',
  'blogger-labels',
  'combx',
  'comment',
  'conversation',
  'disqus',
  'entry-unrelated',
  'extra',
  'foot',
  'form',
  'header',
  'hidden',
  'loader',
  'login', // Note: this can hit 'blogindex'.
  'menu',
  'meta',
  'nav',
  'pager',
  'pagination',
  'predicta', // readwriteweb inline ad box
  'presence_control_external', // lifehacker.com container full of false positives
  'popup',
  'printfriendly',
  'related',
  'remove',
  'remark',
  'rss',
  'share',
  'shoutbox',
  'sidebar',
  'sociable',
  'sponsor',
  'tools'
]

// A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
// blacklist above; if something matches both the blacklist and the
// whitelist, it is kept. This is useful, for example, if something has a
// className of "rss-content entry-content". It matches 'rss', so it would
// normally be removed; however, it's also the entry content, so it should
// be left alone.
//
// These strings are joined together and then tested for existence with
// RegExp#test, so they may contain simple, non-pipe-style regular
// expression fragments if necessary.
export const UNLIKELY_CANDIDATES_WHITELIST = [
  'and',
  'article',
  'body',
  'blogindex',
  'column',
  'content',
  'entry-content-asset',
  'format', // misuse of form
  'hfeed',
  'hentry',
  'hatom',
  'main',
  'page',
  'posts',
  'shadow'
]
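
/**
 * Illustrative sketch (not part of the original module) of how the two lists
 * interact, using the CANDIDATES_BLACKLIST/CANDIDATES_WHITELIST regexes
 * built from them near the bottom of this file:
 *
 * @example
 *   const attrs = 'rss-content entry-content'
 *   CANDIDATES_BLACKLIST.test(attrs) // => true ('rss' matches)
 *   CANDIDATES_WHITELIST.test(attrs) // => true ('content' matches), so keep it
 */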

// A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags.
export const DIV_TO_P_BLOCK_TAGS = [
  'a',
  'blockquote',
  'dl',
  'div',
  'img',
  'p',
  'pre',
  'table'
].join(',')
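
/**
 * Illustrative sketch (assumption: a cheerio-style `$`; the helper is
 * hypothetical, not this module's API): a div with no block-level children
 * is safe to treat as a paragraph.
 *
 * @example
 *   function convertShallowDivs($) {
 *     $('div').each((_, div) => {
 *       if ($(div).find(DIV_TO_P_BLOCK_TAGS).length === 0) {
 *         div.tagName = 'p'
 *       }
 *     })
 *   }
 */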

// A list of tags that should be ignored when trying to find the top candidate
// for a document.
export const NON_TOP_CANDIDATE_TAGS = [
  'br',
  'b',
  'i',
  'label',
  'hr',
  'area',
  'base',
  'basefont',
  'input',
  'img',
  'link',
  'meta'
]

export const NON_TOP_CANDIDATE_TAGS_RE = new RegExp(
  `^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`,
  'i'
)
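
/**
 * @example (illustrative) the ^…$ anchors mean the regex matches a tag name
 * exactly, never as a substring:
 *   NON_TOP_CANDIDATE_TAGS_RE.test('br')    // => true, skip as top candidate
 *   NON_TOP_CANDIDATE_TAGS_RE.test('table') // => false
 */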

// A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
// More examples here: http://microformats.org/wiki/blog-post-formats
export const HNEWS_CONTENT_SELECTORS = [
  ['.hentry', '.entry-content'],
  ['.entry', '.entry-content'],
  ['.entry', '.entry_content'],
  ['.post', '.postbody'],
  ['.post', '.post_body'],
  ['.post', '.post-body']
]
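
/**
 * Illustrative sketch (assumption: a cheerio-style `$`; hypothetical helper):
 * each pair is [containerSelector, contentSelector]; if exactly one node
 * matches, the content can be extracted directly without scoring.
 *
 * @example
 *   function findHNewsContent($) {
 *     for (const [container, content] of HNEWS_CONTENT_SELECTORS) {
 *       const node = $(`${container} ${content}`)
 *       if (node.length === 1) return node
 *     }
 *     return null
 *   }
 */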

export const PHOTO_HINTS = [
  'figure',
  'photo',
  'image',
  'caption'
]

export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i')
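
/**
 * @example (illustrative)
 *   PHOTO_HINTS_RE.test('figcaption') // => true ('caption' matches)
 */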

// A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
export const POSITIVE_SCORE_HINTS = [
  'article',
  'articlecontent',
  'instapaper_body',
  'blog',
  'body',
  'content',
  'entry-content-asset',
  'entry',
  'hentry',
  'main',
  'Normal',
  'page',
  'pagination',
  'permalink',
  'post',
  'story',
  'text',
  '[-_]copy', // usatoday
  '\\Bcopy' // escaped so the regex sees \B (non-word-boundary), not a literal B
]

// The above list, joined into a matching regular expression
export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i')

// Readability publisher-specific guidelines
export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i')

// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
export const NEGATIVE_SCORE_HINTS = [
  'adbox',
  'advert',
  'author',
  'bio',
  'bookmark',
  'bottom',
  'byline',
  'clear',
  'com-',
  'combx',
  'comment',
  'comment\\B', // escaped so the regex sees \B, not a literal B
  'contact',
  'copy',
  'credit',
  'crumb',
  'date',
  'deck',
  'excerpt',
  'featured', // tnr.com has a featured_content which throws us off
  'foot',
  'footer',
  'footnote',
  'graf',
  'head',
  'info',
  'infotext', // newscientist.com copyright
  'instapaper_ignore',
  'jump',
  'linebreak',
  'link',
  'masthead',
  'media',
  'meta',
  'modal',
  'outbrain', // slate.com junk
  'promo',
  'pr_', // autoblog - press release
  'related',
  'respond',
  'roundcontent', // lifehacker restricted content warning
  'scroll',
  'secondary',
  'share',
  'shopping',
  'shoutbox',
  'side',
  'sidebar',
  'sponsor',
  'stamp',
  'sub',
  'summary',
  'tags',
  'tools',
  'widget'
]

// The above list, joined into a matching regular expression
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')
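
/**
 * Illustrative scoring sketch (hypothetical helper and weight, not this
 * module's API): the two regexes are typically run against a node's combined
 * className and id, nudging a weight up or down.
 *
 * @example
 *   function hintScore(className, id) {
 *     const attrs = `${className} ${id}`
 *     let score = 0
 *     if (POSITIVE_SCORE_RE.test(attrs)) score += 25
 *     if (NEGATIVE_SCORE_RE.test(attrs)) score -= 25
 *     return score
 *   }
 *   hintScore('article-body', '') // => 25
 *   hintScore('sidebar', '')      // => -25
 */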

// XPath to try to determine if a page is wordpress. Not always successful.
// (The generator meta tag stores its text in the content attribute.)
export const IS_WP_XPATH = "//meta[@name='generator'][starts-with(@content,'WordPress')]"
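
/**
 * Illustrative sketch (browser DOM API; hypothetical helper): evaluating the
 * XPath against a parsed document, coerced to a boolean (true when the
 * node-set is non-empty).
 *
 * @example
 *   function isWordPress(doc) {
 *     const result = doc.evaluate(
 *       IS_WP_XPATH, doc, null, XPathResult.BOOLEAN_TYPE, null
 *     )
 *     return result.booleanValue
 *   }
 */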

// Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]')

// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
export const EXTRANEOUS_LINK_HINTS = [
  'print',
  'archive',
  'comment',
  'discuss',
  'e-mail',
  'email',
  'share',
  'reply',
  'all',
  'login',
  'sign',
  'single',
  'adx',
  'entry-unrelated'
]

export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')

// An expression that looks to try to find the page digit within a URL, if
// it exists.
// Matches:
// page=1
// pg=1
// p=1
// paging=12
// pag=7
// pagination/1
// paging/88
// pa/83
// p/11
//
// Does not match:
// pg=102
// page:2
// DISABLING FOR NOW TODO AP
// (Note: (?P<pagenum>...) is Python named-group syntax and would throw in a
// JavaScript RegExp; it needs porting before this can be re-enabled.)
// export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)?(=|\/)(?P<pagenum>[0-9]{1,2})))', 'i')

// Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i')

// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^|]|$)|»([^|]|$))', 'i')

// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i')

// Match any link text/classname/id that looks like it means the previous
// page.
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i')
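
/**
 * Illustrative sketch (hypothetical helper): combining the pagination hints
 * to guess whether a link advances to the next page.
 *
 * @example
 *   function looksLikeNextLink(text, href) {
 *     if (EXTRANEOUS_LINK_HINTS_RE.test(text)) return false
 *     if (PREV_LINK_TEXT_RE.test(text)) return false
 *     return NEXT_LINK_TEXT_RE.test(text) || PAGE_RE.test(href)
 *   }
 *   looksLikeNextLink('Next »', '/story?page=2') // => true
 *   looksLikeNextLink('« Previous', '/story')    // => false
 */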

// Match 2 or more consecutive <br> tags
export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i')

// Match 1 BR tag.
export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i')
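
/**
 * Illustrative sketch (hypothetical helper): runs of <br> tags often stand
 * in for paragraph breaks, so they can be collapsed into a paragraph
 * boundary.
 *
 * @example
 *   function brsToParagraphBreaks(html) {
 *     return html.replace(new RegExp(BR_TAGS_RE.source, 'gi'), '</p><p>')
 *   }
 *   brsToParagraphBreaks('one<br /><br />two') // => 'one</p><p>two'
 */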

// A list of all of the block level tags known in HTML5 and below. Taken from
// http://bit.ly/qneNIT
export const BLOCK_LEVEL_TAGS = [
  'article',
  'aside',
  'blockquote',
  'body',
  'br',
  'button',
  'canvas',
  'caption',
  'col',
  'colgroup',
  'dd',
  'div',
  'dl',
  'dt',
  'embed',
  'fieldset',
  'figcaption',
  'figure',
  'footer',
  'form',
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6',
  'header',
  'hgroup',
  'hr',
  'li',
  'map',
  'object',
  'ol',
  'output',
  'p',
  'pre',
  'progress',
  'section',
  'table',
  'tbody',
  'textarea',
  'tfoot',
  'th',
  'thead',
  'tr',
  'ul',
  'video'
]

export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i')
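
/**
 * @example (illustrative)
 *   BLOCK_LEVEL_TAGS_RE.test('div')  // => true
 *   BLOCK_LEVEL_TAGS_RE.test('span') // => false, inline
 */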

// The removal is implemented as a blacklist and whitelist; this test finds
// blacklisted elements that aren't whitelisted. We do this all in one
// expression, both because it's only one pass, and because it skips the
// serialization for whitelisted nodes.
const candidates_blacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|')
export const CANDIDATES_BLACKLIST = new RegExp(candidates_blacklist, 'i')

const candidates_whitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|')
export const CANDIDATES_WHITELIST = new RegExp(candidates_whitelist, 'i')

// Negative lookahead: match only when no whitelist term appears anywhere in
// the string, but some blacklist term does.
export const UNLIKELY_RE = new RegExp(
  `^(?!.*(${candidates_whitelist}))(?=.*(${candidates_blacklist}))`,
  'i'
)
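
/**
 * @example (illustrative)
 *   UNLIKELY_RE.test('sidebar widget')            // => true, strip it
 *   UNLIKELY_RE.test('rss-content entry-content') // => false, whitelisted
 */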

export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i')
export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i')
export const BAD_TAGS = new RegExp('^(address|form)$', 'i')

export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i')