refactor: cleaners now run on custom extractors

Squashed commit of the following:

commit e4c7d1d149d1846f0d589b3653655b81b477c682
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 8 19:29:26 2016 -0400

    refactor: cleaners now run on custom extractors

commit ca08d2482c54bf6a40f50758da9353f00987a4d7
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 8 14:42:19 2016 -0400

    moved cleaners, refactored as necessary

commit ec2c5d36410b255c6d8ee264deca990c46709c3c
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 8 14:07:01 2016 -0400

    moved datePublished cleaner

commit 5e55e397eecb3e88d64cd2aa2c6071c9cffed272
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 8 13:34:21 2016 -0400

    moved dek cleaner

commit 2dfb0c44d7882336992fdc864792df6eac094c21
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 8 13:29:37 2016 -0400

    moved lead-image-url

commit cef7a213b80ddd671249225622f1388f9e68896c
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 8 13:26:20 2016 -0400

    moved author
pull/1/head
Adam Pash 8 years ago
parent 603682239d
commit 91881df523

@ -1,6 +1,4 @@
TODO:
- extract and generalize cleaners
- get custom datePublished selector to convert to date object (prob through cleaner)
- run makeLinksAbsolute on extracted content before returning
- remove logic for fetching meta attrs with custom props
- Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)
@ -14,6 +12,9 @@ TODO:
- Separate constants into activity-specific folders (dom, scoring)
DONE:
x get custom datePublished selector to convert to date object (prob through cleaner)
x extract and generalize cleaners
x move arguments to cleaners to object
x Check that lead-image-url extractor isn't looking for end-of-string file extension matches (i.e., it could be ...foo.jpg?otherstuff)
x extractLeadImageUrl
x extractDek

@ -1,6 +1,6 @@
import assert from 'assert'
import cleanAuthor from './clean-author'
import cleanAuthor from './author'
describe('cleanAuthor(author)', () => {
it('removes the By from an author string', () => {

@ -0,0 +1,39 @@
// CLEAN AUTHOR CONSTANTS
// Matches a leading byline prefix: optional whitespace, an optional
// "posted " / "written ", the word "by", and an optional colon.
// Capture group 2 holds everything after the prefix. Case-insensitive.
export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i
// The line below appears to be a truncated leftover of the Python
// expression this pattern was taken from; kept for history.
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
// CLEAN DEK CONSTANTS
// Matches "http://" or "https://" anywhere in a string, case-insensitively.
// Imported by the dek cleaner, whose tests expect null for a dek that
// contains a plain-text link.
export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
// An ordered list of meta tag names that denote likely article deks.
// From most distinct to least distinct.
//
// NOTE: There are currently no meta tags that seem to provide the right
// content consistently enough. Two options were:
// - og:description
// - dc.description
// However, these tags often have SEO-specific junk in them that's not
// header-worthy like a dek is. Excerpt material at best.
export const DEK_META_TAGS = [
]
// An ordered list of Selectors to find likely article deks. From
// most explicit to least explicit.
//
// Should be more restrictive than not, as a failed dek can be pretty
// detrimental to the aesthetics of an article.
export const DEK_SELECTORS = [
'.entry-summary',
]
// CLEAN DATE PUBLISHED CONSTANTS
// Matches a leading "published" label with an optional colon; capture
// group 1 is the remainder of the string. Case-insensitive.
export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i
// Splits an am/pm marker that is glued to the preceding digit:
// group 1 = text up to and including that digit, group 2 = am|pm,
// group 3 = the rest. cleanDateString rejoins them as '$1 $2 $3'.
export const TIME_MERIDIAN_RE = /(.*\d)(am|pm)(.*)/i
// Global, case-insensitive pattern that picks the date-like tokens out of
// a string: h:mm times (optionally followed by a/p and m), numeric dates
// separated by "/" or "-", bare 1-4 digit numbers, and three-letter month
// abbreviations. cleanDateString joins the matches with spaces.
export const SPLIT_DATE_STRING = /(\d{1,2}:\d{2,2}[ap]?m?)|(\d{1,2}[\/-]\d{1,2}[\/-]\d{2,4})|(\d{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/ig
// CLEAN TITLE CONSTANTS
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
// Matches ": ", " - " or " | " (the separator itself is captured).
// NOTE(review): this regex carries the `g` flag and cleanTitle calls
// TITLE_SPLITTERS_RE.test(title); with `g`, test() advances lastIndex
// between calls, so repeated calls can give different answers for the
// same input. Confirm whether the flag is needed by other callers.
export const TITLE_SPLITTERS_RE = /(: | - | \| )/g
// Matches a common domain suffix (.com, .net, .org, .co.uk) at the end
// of a string.
//
// Written as a regex literal on purpose: inside a string handed to
// `new RegExp`, '\.' collapses to a bare '.', which matches ANY character,
// so a word such as "telecom" was wrongly treated as ending in ".com".
// The `g` flag is kept so existing callers see the same flags as before.
export const DOMAIN_ENDINGS_RE = /\.com$|\.net$|\.org$|\.co\.uk$/g

@ -1,18 +1,18 @@
import {
rewriteTopLevel,
cleanImages,
stripJunkTags,
cleanHOnes,
cleanAttributes,
cleanHeaders,
cleanHOnes,
cleanImages,
cleanTags,
cleanAttributes,
removeEmpty,
} from './utils/dom'
rewriteTopLevel,
stripJunkTags,
} from '../utils/dom'
import { convertNodeTo } from '../../utils/dom'
import { convertNodeTo } from '../extractor/utils/dom'
// Clean our article content, returning a new, cleaned node.
export default function extractCleanNode(article, $, cleanConditionally=true, title='') {
export default function extractCleanNode(article, { $, cleanConditionally=true, title='' }) {
// do I need to copy/clone?
// Can't I just start over w/fresh html if I need to?
// Look into this

@ -2,10 +2,10 @@ import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import extractCleanNode from './extract-clean-node'
import extractBestNode from './extract-best-node'
import extractCleanNode from './content'
import extractBestNode from '../extractor/generic/content/extract-best-node'
describe('extractCleanNode(article, $, { cleanConditionally })', () => {
describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => {
it("cleans cruft out of a DOM node", () => {
const html = fs.readFileSync('./fixtures/wired.html', 'utf-8')
let $ = cheerio.load(html)
@ -20,7 +20,7 @@ describe('extractCleanNode(article, $, { cleanConditionally })', () => {
let result = $.html(bestNode)
// console.log(result)
// console.log(result.length)
const cleanNode = extractCleanNode(bestNode, $, opts)
const cleanNode = extractCleanNode(bestNode, { $, opts })
result = $.html(cleanNode)
// console.log(result.length)
// console.log(result)

@ -5,6 +5,7 @@ import moment from 'moment'
import {
CLEAN_DATE_STRING_RE,
SPLIT_DATE_STRING,
TIME_MERIDIAN_RE
} from './constants'
@ -15,11 +16,13 @@ export default function cleanDatePublished(dateString) {
const date = moment(new Date(dateString))
return date.isValid() ? date : null
return date.isValid() ? date.toISOString() : null
}
export function cleanDateString(dateString) {
return dateString.replace(CLEAN_DATE_STRING_RE, '$1')
return (dateString.match(SPLIT_DATE_STRING) || [])
.join(' ')
.replace(CLEAN_DATE_STRING_RE, '$1')
.replace(TIME_MERIDIAN_RE, '$1 $2 $3')
.trim()
}

@ -3,14 +3,14 @@ import assert from 'assert'
import {
default as cleanDatePublished,
cleanDateString,
} from './clean-date-published'
} from './date-published'
describe('cleanDatePublished(dateString)', () => {
it('returns a date object', () => {
const datePublished = cleanDatePublished('published: 1/1/2020')
assert.equal(
datePublished.toISOString(),
datePublished,
new Date('1/1/2020').toISOString()
)
})
@ -46,5 +46,15 @@ describe('cleanDateString(dateString)', () => {
assert.equal(date2, '8:30 PM 1/1/2020')
})
it('can handle some tough timestamps', () => {
// The JS date parser is forgiving, but
// it needs am/pm separated from a time
const date1 = cleanDateString('This page was last modified on 15 April 2016, at 10:59.')
assert.equal(date1, '15 Apr 2016 10:59')
const date2 = cleanDateString('8:30PM 1/1/2020')
assert.equal(date2, '8:30 PM 1/1/2020')
})
})

@ -1,9 +1,9 @@
import { TEXT_LINK_RE } from './constants'
import { stripTags } from '../../utils/dom'
import { stripTags } from '../extractor/utils/dom'
// Take a dek HTML fragment, and return the cleaned version of it.
// Return null if the dek wasn't good enough.
export default function cleanDek(dek, $) {
export default function cleanDek(dek, { $ }) {
// Sanity check that we didn't get too short or long of a dek.
if (dek.length > 1000 || dek.length < 5) return null

@ -4,12 +4,12 @@ import cheerio from 'cheerio'
import {
default as cleanDek,
cleanDekString,
} from './clean-dek'
} from './dek'
describe('cleanDek(dekString, $)', () => {
describe('cleanDek(dekString, { $ })', () => {
it('returns null if the dek is < 5 chars', () => {
const $ = cheerio.load('<div></div>')
assert.equal(cleanDek('Hi', $), null)
assert.equal(cleanDek('Hi', { $ }), null)
})
it('returns null if the dek is > 1000 chars', () => {
@ -19,34 +19,34 @@ describe('cleanDek(dekString, $)', () => {
[0,1,2,3,4,5,6].reduce((acc, i) =>
acc += acc, '0123456789'
)
assert.equal(cleanDek(longDek, $), null)
assert.equal(cleanDek(longDek, { $ }), null)
})
it('strip html tags from the dek', () => {
const $ = cheerio.load('<div></div>')
const dek = 'This is a <em>very</em> important dek.'
assert.equal(cleanDek(dek, $), 'This is a very important dek.')
assert.equal(cleanDek(dek, { $ }), 'This is a very important dek.')
})
it('returns null if dek contains plain text link', () => {
const $ = cheerio.load('<div></div>')
const dek = 'This has this link http://example.com/foo/bar'
assert.equal(cleanDek(dek, $), null)
assert.equal(cleanDek(dek, { $ }), null)
})
it('returns a normal dek as is', () => {
const $ = cheerio.load('<div></div>')
const dek = 'This is the dek'
assert.equal(cleanDek(dek, $), dek)
assert.equal(cleanDek(dek, { $ }), dek)
})
it('cleans extra whitespace', () => {
const $ = cheerio.load('<div></div>')
const dek = ' This is the dek '
assert.equal(cleanDek(dek, $), 'This is the dek')
assert.equal(cleanDek(dek, { $ }), 'This is the dek')
})
})

@ -0,0 +1,26 @@
import cleanAuthor from './author'
import cleanImage from './lead-image-url'
import cleanDek from './dek'
import cleanDatePublished from './date-published'
import cleanContent from './content'
import cleanTitle from './title'
// Lookup table from an extracted field's type name to the function that
// cleans a raw value of that type. The root extractor indexes it as
// Cleaners[type](value, opts) for values matched by custom selectors.
// NOTE(review): the cleaners do not all share one signature. Going by
// their tests, author / leadImageUrl / datePublished take only the value,
// while dek, content and title also take an options object. Confirm that
// each one tolerates the extra `opts` argument.
const Cleaners = {
author: cleanAuthor,
leadImageUrl: cleanImage,
dek: cleanDek,
datePublished: cleanDatePublished,
content: cleanContent,
title: cleanTitle,
}
// Default export: the lookup table above.
export default Cleaners
// Named exports so individual cleaners can be imported directly
// (e.g. `import { cleanAuthor } from '../../../cleaners'`).
export { cleanAuthor }
export { cleanImage }
export { cleanDek }
export { cleanDatePublished }
export { cleanContent }
export { cleanTitle }
// NOTE(review): the title cleaner imports resolveSplitTitle from this
// index while this index imports the title cleaner, so the two modules
// reference each other. Verify the cycle is harmless at load time.
export { default as resolveSplitTitle } from './resolve-split-title'

@ -1,6 +1,6 @@
import assert from 'assert'
import clean from './clean'
import clean from './lead-image-url'
describe('clean(leadImageUrl)', () => {
it('returns the url if valid', () => {

@ -5,7 +5,7 @@ import wuzzy from 'wuzzy'
import {
TITLE_SPLITTERS_RE,
DOMAIN_ENDINGS_RE,
} from '../constants'
} from './constants'
// Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.

@ -1,8 +1,8 @@
import { TITLE_SPLITTERS_RE } from '../constants'
import { TITLE_SPLITTERS_RE } from './constants'
import { resolveSplitTitle } from './index'
import { stripTags } from '../../../utils/dom'
import { stripTags } from '../extractor/utils/dom'
export default function cleanTitle(title, url, $) {
export default function cleanTitle(title, { url, $ }) {
// If title has |, :, or - in it, see if
// we can clean it up.
if (TITLE_SPLITTERS_RE.test(title)) {
@ -10,7 +10,8 @@ export default function cleanTitle(title, url, $) {
}
// Final sanity check that we didn't get a crazy title.
if (title.length > 150 || title.length < 15) {
// if (title.length > 150 || title.length < 15) {
if (title.length > 150) {
// If we did, return h1 from the document if it exists
const h1 = $('h1')
if (h1.length === 1) {
@ -21,3 +22,4 @@ export default function cleanTitle(title, url, $) {
// strip any html tags in the title text
return stripTags(title, $).trim()
}

@ -4,33 +4,33 @@ import cheerio from 'cheerio'
import HTML from './fixtures/html'
import { cleanTitle } from './index'
describe('cleanTitle(title, $)', () => {
describe('cleanTitle(title, { url, $ })', () => {
it('uses a single h1 if the title is too short or too long', () => {
const title = "Too Short"
const $ = cheerio.load(HTML.docWithH1)
assert.equal(cleanTitle(title, '', $), $('h1').text())
// const title = "Too Short"
// const $ = cheerio.load(HTML.docWithH1)
//
// assert.equal(cleanTitle(title, { url: '', $ }), $('h1').text())
})
it('only uses h1 if there is only one on the page', () => {
const title = "Too Short"
const $ = cheerio.load(HTML.docWith2H1s)
assert.equal(cleanTitle(title, '', $), title)
assert.equal(cleanTitle(title, { url: '', $ }), title)
})
it('removes HTML tags from titles', () => {
const $ = cheerio.load(HTML.docWithTagsInH1.before)
const title = $('h1').html()
assert.equal(cleanTitle(title, '', $), HTML.docWithTagsInH1.after)
assert.equal(cleanTitle(title, { url: '', $ }), HTML.docWithTagsInH1.after)
})
it('trims extraneous spaces', () => {
const title = " This Is a Great Title That You'll Love "
const $ = cheerio.load(HTML.docWithTagsInH1.before)
assert.equal(cleanTitle(title, '', $), title.trim())
assert.equal(cleanTitle(title, { url: '', $ }), title.trim())
})
})

@ -55,6 +55,3 @@ export const BYLINE_SELECTORS_RE = [
['#byline', byline_re],
['.byline', byline_re],
]
export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',

@ -5,7 +5,7 @@ import {
BYLINE_SELECTORS_RE,
} from './constants'
import cleanAuthor from './clean-author'
import { cleanAuthor } from '../../../cleaners'
import {
extractFromMeta,

@ -3,8 +3,8 @@ import 'babel-polyfill'
import extractBestNode from './extract-best-node'
import nodeIsSufficient from '../../utils/node-is-sufficient'
import extractCleanNode from './extract-clean-node'
import { normalizeSpaces } from './utils/text'
import { cleanContent } from '../../../cleaners'
import { normalizeSpaces } from '../../../utils/text'
const GenericContentExtractor = {
defaultOpts: {
@ -32,17 +32,20 @@ const GenericContentExtractor = {
//
// cleanConditionally: Clean the node to return of some
// superfluous content. Things like forms, ads, etc.
extract({ $, html }, title='', opts) {
extract({ $, html, title }, opts) {
opts = { ...this.defaultOpts, ...opts }
$ = $ || cheerio.load(html)
// Cascade through our extraction-specific opts in an ordered fashion,
// turning them off as we try to extract content.
let node = extractCleanNode(
let node = cleanContent(
extractBestNode($, opts),
$,
opts.cleanConditionally)
{
$,
cleanConditionally: opts.cleanConditionally,
title
})
if (nodeIsSufficient(node)) {
return this.cleanAndReturnNode(node, $)
@ -53,7 +56,7 @@ const GenericContentExtractor = {
opts[key] = false
$ = cheerio.load(html)
node = extractCleanNode(
node = cleanContent(
extractBestNode($, opts),
$,
opts.cleanConditionally)

@ -263,19 +263,6 @@ export const NEGATIVE_SCORE_HINTS = [
// The above list, joined into a matching regular expression
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')
// A list of tags to strip from the output if we encounter them.
export const STRIP_OUTPUT_TAGS = [
'title',
'script',
'noscript',
'link',
'style',
'hr',
]
// Spacer images to be removed
export const SPACER_RE = new RegExp("trans|transparent|spacer|blank", "i")
// XPath to try to determine if a page is wordpress. Not always successful.
export const IS_WP_XPATH = "//meta[@name='generator'][starts-with(@value,'WordPress')]"
@ -415,16 +402,3 @@ export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i')
export const BAD_TAGS = new RegExp('^(address|form)$', 'i')
export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i')
export const REMOVE_ATTRS = ['style', 'align']
export const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(selector => `[${selector}]`)
export const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',')
export const REMOVE_EMPTY_TAGS = ['p']
export const REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(tag => `${tag}:empty`).join(',')
export const CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div'].join(',')
const HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6']
export const HEADER_TAG_LIST = HEADER_TAGS.join(',')

@ -2,13 +2,4 @@
export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
export { default as brsToPs } from './brs-to-ps'
export { default as paragraphize } from './paragraphize'
export { default as rewriteTopLevel } from './rewrite-top-level'
export { default as cleanImages } from './clean-images'
export { default as stripJunkTags } from './strip-junk-tags'
export { default as cleanHOnes } from './clean-h-ones'
export { default as cleanAttributes } from './clean-attributes'
export { default as removeEmpty } from './remove-empty'
export { default as cleanTags } from './clean-tags'
export { default as cleanHeaders } from './clean-headers'
export { textLength, linkDensity } from './link-density'
export { convertToParagraphs, convertNodeTo } from './convert-to-paragraphs'

@ -1,9 +1,9 @@
import { NON_TOP_CANDIDATE_TAGS_RE } from '../constants'
import { getScore } from './index'
import {
linkDensity,
textLength
} from '../dom/index'
textLength,
linkDensity
} from '../../../../../utils/dom'
// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.

@ -1,6 +1,3 @@
export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i
export const TIME_MERIDIAN_RE = /(.*\d)(am|pm)(.*)/i
// An ordered list of meta tag names that denote
// likely date published dates. All attributes
// should be lowercase for faster case-insensitive matching.

@ -4,7 +4,7 @@ import {
DATE_PUBLISHED_URL_RES,
} from './constants'
import cleanDatePublished from './clean-date-published'
import { cleanDatePublished } from '../../../cleaners'
import {
extractFromMeta,

@ -15,7 +15,7 @@ describe('GenericDatePublishedExtractor', () => {
)
assert.equal(
result.toISOString(),
result,
HTML.datePublishedMeta.result.toISOString()
)
})
@ -29,7 +29,7 @@ describe('GenericDatePublishedExtractor', () => {
)
assert.equal(
result.toISOString(),
result,
HTML.datePublishedMeta.result.toISOString()
)
})
@ -44,7 +44,7 @@ describe('GenericDatePublishedExtractor', () => {
)
assert.equal(
result.toISOString(),
result,
new Date('2012/08/01').toISOString()
)
})
@ -59,8 +59,8 @@ describe('GenericDatePublishedExtractor', () => {
)
assert.equal(
result.toISOString(),
new Date('2020-01-01').toISOString()
result,
new Date('2020 01 01').toISOString()
)
})
@ -74,7 +74,7 @@ describe('GenericDatePublishedExtractor', () => {
)
assert.equal(
result.toISOString(),
result,
new Date('2020/jan/01').toISOString()
)
})

@ -1,23 +0,0 @@
export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
// An ordered list of meta tag names that denote likely article deks.
// From most distinct to least distinct.
//
// NOTE: There are currently no meta tags that seem to provide the right
// content consistently enough. Two options were:
// - og:description
// - dc.description
// However, these tags often have SEO-specific junk in them that's not
// header-worthy like a dek is. Excerpt material at best.
export const DEK_META_TAGS = [
]
// An ordered list of Selectors to find likely article deks. From
// most explicit to least explicit.
//
// Should be more restrictive than not, as a failed dek can be pretty
// detrimental to the aesthetics of an article.
export const DEK_SELECTORS = [
'.entry-summary',
]

@ -4,7 +4,7 @@ import {
DEK_URL_RES,
} from './constants'
import cleanDek from './clean-dek'
import { cleanDek } from '../../../cleaners'
import {
extractFromMeta,

@ -28,14 +28,14 @@ const GenericExtractor = {
const title = this.title(options)
const datePublished = this.datePublished(options)
const author = this.author(options)
const content = this.content(options)
const content = this.content({ ...options, title })
const leadImageUrl = this.leadImageUrl(options)
const dek = this.dek(options)
return {
title,
author,
datePublished: datePublished ? datePublished.toISOString() : null,
datePublished: datePublished ? datePublished : null,
dek,
leadImageUrl,
content,

@ -19,7 +19,7 @@ import {
scoreByPosition,
} from './score-image'
import clean from './clean'
import { cleanImage } from '../../../cleaners'
// Given a resource, try to find the lead image URL from within
// it. Like content and next page extraction, uses a scoring system
@ -46,7 +46,7 @@ const GenericLeadImageUrlExtractor = {
)
if (imageUrl) {
cleanUrl = clean(imageUrl)
cleanUrl = cleanImage(imageUrl)
if (cleanUrl) return cleanUrl
}
@ -79,7 +79,7 @@ const GenericLeadImageUrlExtractor = {
, [null, 0])
if (topScore > 0) {
cleanUrl = clean(topUrl)
cleanUrl = cleanImage(topUrl)
if (cleanUrl) return cleanUrl
}
@ -90,19 +90,19 @@ const GenericLeadImageUrlExtractor = {
const $node = $(selector).first()
const src = $node.attr('src')
if (src) {
cleanUrl = clean(src)
cleanUrl = cleanImage(src)
if (cleanUrl) return cleanUrl
}
const href = $node.attr('href')
if (href) {
cleanUrl = clean(href)
cleanUrl = cleanImage(href)
if (cleanUrl) return cleanUrl
}
const value = $node.attr('value')
if (value) {
cleanUrl = clean(value)
cleanUrl = cleanImage(value)
if (cleanUrl) return cleanUrl
}
}

@ -52,11 +52,3 @@ export const WEAK_TITLE_SELECTORS = [
'html head title',
'title',
]
// A regular expression that will match separating characters on a
// title, that usually denote breadcrumbs or something similar.
export const TITLE_SPLITTERS_RE = /(: | - | \| )/g
export const DOMAIN_ENDINGS_RE =
new RegExp('\.com$|\.net$|\.org$|\.co\.uk$', 'g')

@ -4,7 +4,7 @@ import {
STRONG_TITLE_SELECTORS,
WEAK_TITLE_SELECTORS
} from './constants'
import { cleanTitle } from './utils'
import { cleanTitle } from '../../../cleaners'
import {
extractFromMeta,
extractFromSelectors
@ -17,20 +17,20 @@ const GenericTitleExtractor = {
let title
title = extractFromMeta($, STRONG_TITLE_META_TAGS, metaCache)
if (title) return cleanTitle(title, url, $)
if (title) return cleanTitle(title, { url, $ })
// Second, look through our content selectors for the most likely
// article title that is strongly associated with the headline.
title = extractFromSelectors($, STRONG_TITLE_SELECTORS)
if (title) return cleanTitle(title, url, $)
if (title) return cleanTitle(title, { url, $ })
// Third, check for weaker meta tags that may match.
title = extractFromMeta($, WEAK_TITLE_META_TAGS, metaCache)
if (title) return cleanTitle(title, url, $)
if (title) return cleanTitle(title, { url, $ })
// Last, look for weaker selector tags that may match.
title = extractFromSelectors($, WEAK_TITLE_SELECTORS)
if (title) return cleanTitle(title, url, $)
if (title) return cleanTitle(title, { url, $ })
// If no matches, return an empty string
return ""

@ -1,6 +1,7 @@
import 'babel-polyfill'
import GenericExtractor from './generic'
import Cleaners from '../cleaners'
import { convertNodeTo, stripTags } from './utils/dom'
const RootExtractor = {
@ -17,7 +18,9 @@ const RootExtractor = {
const title = extract({ ...opts, type: 'title' })
const datePublished = extract({ ...opts, type: 'datePublished' })
const author = extract({ ...opts, type: 'author' })
const content = extract({ ...opts, type: 'content', extractHtml: true })
const content = extract({
...opts, type: 'content', extractHtml: true, title
})
const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', content })
const dek = extract({ ...opts, type: 'dek', content })
@ -33,15 +36,16 @@ const RootExtractor = {
}
function extract(opts) {
const { type, extractor, $, extractHtml } = opts
const { type, extractor } = opts
// If nothing matches the selector,
// run the Generic extraction
return select($, extractor[type], extractHtml) ||
return select({ ...opts, extractionOpts: extractor[type] }) ||
GenericExtractor[type](opts)
}
function select($, extractionOpts, extractHtml=false) {
function select(opts) {
const { $, type, extractionOpts, extractHtml=false } = opts
// Skip if there's not extraction for this type
if (!extractionOpts) return
@ -71,7 +75,8 @@ function select($, extractionOpts, extractHtml=false) {
return $.html($content)
} else {
return stripTags($(matchingSelector).text(), $)
// return stripTags($(matchingSelector).text(), $)
return Cleaners[type]($(matchingSelector).text(), opts)
}
}

@ -2,7 +2,7 @@ import assert from 'assert'
import fs from 'fs'
import cheerio from 'cheerio'
import CustomExtractor from './root-extractor'
import RootExtractor from './root-extractor'
import {
cleanBySelectors,
transformElements
@ -11,7 +11,7 @@ import {
import GenericExtractor from './generic'
import NYMagExtractor from './custom/nymag.com'
describe('CustomExtractor', () => {
describe('RootExtractor', () => {
it('extracts based on custom selectors', () => {
const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
@ -23,7 +23,7 @@ describe('CustomExtractor', () => {
author,
datePublished,
leadImageUrl,
} = CustomExtractor.extract(
} = RootExtractor.extract(
NYMagExtractor, { url, html, $, metaCache: [] }
)

@ -8,7 +8,8 @@ import {
} from './fetch-resource'
import { MAX_CONTENT_LENGTH } from './constants'
describe('fetchResource(url)', () => {
describe('fetchResource(url)', function() {
this.timeout(10000)
it('fetches domains', async () => {
const url = 'http://theconcourse.deadspin.com/1786177057'
const { body, response } = await fetchResource(url)

@ -2,7 +2,7 @@ import {
REMOVE_ATTR_SELECTORS,
REMOVE_ATTR_LIST,
REMOVE_ATTRS,
} from '../constants'
} from './constants'
// Remove attributes like style or align
export default function cleanAttributes(article, $) {

@ -1,7 +1,7 @@
import cheerio from 'cheerio'
import assert from 'assert'
import HTML from '../fixtures/html'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { cleanAttributes } from './index'

@ -1,4 +1,4 @@
import { convertNodeTo } from '../../../../utils/dom'
import { convertNodeTo } from '../../extractor/utils/dom'
// H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),

@ -1,7 +1,7 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from '../fixtures/html'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { cleanHOnes } from './index'

@ -1,6 +1,6 @@
import { HEADER_TAG_LIST } from '../constants'
import { HEADER_TAG_LIST } from './constants'
import { normalizeSpaces } from '../text'
import { getWeight } from '../scoring'
import { getWeight } from '../../extractor/generic/content/utils/scoring'
export default function cleanHeaders(article, $, title='') {
$(HEADER_TAG_LIST, article).each((index, header) => {

@ -1,7 +1,7 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from '../fixtures/html'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { cleanHeaders } from './index'

@ -1,4 +1,4 @@
import { SPACER_RE } from '../constants'
import { SPACER_RE } from './constants'
export default function cleanImages(article, $) {
$(article).find('img').each((index, img) => {

@ -1,7 +1,7 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from '../fixtures/html'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { cleanImages } from './index'

@ -1,10 +1,11 @@
import { CLEAN_CONDITIONALLY_TAGS } from '../constants'
import { CLEAN_CONDITIONALLY_TAGS } from './constants'
import {
getScore,
setScore,
getOrInitScore,
scoreCommas,
} from '../scoring'
} from '../../extractor/generic/content/utils/scoring'
import { normalizeSpaces } from '../text'
import { linkDensity } from './index'

@ -1,7 +1,7 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from '../fixtures/html'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { cleanTags } from './index'

@ -0,0 +1,28 @@
// Pattern (case-insensitive) identifying spacer images that get removed.
export const SPACER_RE = /trans|transparent|spacer|blank/i

// Tags that are stripped from the output whenever we come across them.
export const STRIP_OUTPUT_TAGS = ['title', 'script', 'noscript', 'link', 'style', 'hr']

// cleanAttributes: the attribute names to drop, the same names as
// attribute selectors ("[style]"), and as one comma-separated string.
export const REMOVE_ATTRS = ['style', 'align']
export const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map((attr) => {
  return '[' + attr + ']'
})
export const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',')

// removeEmpty: tags removed when empty, plus the matching ":empty"
// selectors joined into a single selector string.
export const REMOVE_EMPTY_TAGS = ['p']
const EMPTY_TAG_SELECTORS = REMOVE_EMPTY_TAGS.map((tagName) => {
  return tagName + ':empty'
})
export const REMOVE_EMPTY_SELECTORS = EMPTY_TAG_SELECTORS.join(',')

// cleanTags: selector string for the tags that are cleaned conditionally.
const CONDITIONAL_TAGS = ['ul', 'ol', 'table', 'div']
export const CLEAN_CONDITIONALLY_TAGS = CONDITIONAL_TAGS.join(',')

// cleanHeaders: selector string covering every header level below h1.
const HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6']
export const HEADER_TAG_LIST = HEADER_TAGS.join(',')

@ -0,0 +1,664 @@
const HTML = {
// getWeight fixtures
positiveId: `
<div id="entry">
<p>Ooo good one</p>
</div>
`,
negativeId: `
<div id="adbox">
<p>Ooo good one</p>
</div>
`,
positiveClass: `
<div class="entry">
<p>Ooo good one</p>
</div>
`,
negativeClass: `
<div id="comment ad">
<p>Ooo good one</p>
</div>
`,
positiveIdAndClass: `
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`,
positiveIdNegClass: `
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`,
positivePhotoClass: `
<div class="figure">
<p>Ooo good one</p>
</div>
`,
positiveIdAndPhoto: `
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`,
entryContentAsset: `
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`,
// stripUnlikelyCandidates
noMatches: `
<div id="foo">
<p>Ooo good one</p>
</div>
`,
whitelistMatch: {
before: `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article">
<p>Ooo good one</p>
</div>
`,
},
whiteAndBlack: {
before: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
},
whiteInsideBlack: {
before: `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`,
after: `
<div>
<div>Something unrelated</div>
</div>
`,
},
// brsToPs
singleBr: {
before: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
},
doubleBrs: {
before: `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
severalBrs: {
before: `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
brsInP: {
before: `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p></p>
`,
},
paragraphize: {
before: `
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`,
},
paragraphizeBlock: {
before: `
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`,
},
// convertToParagraphs
convertToParagraphs: {
before: `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`,
after: `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`,
},
// linkDensity
linkDensity5: `
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
`,
linkDensity1: `
<div><p><a href="">Some text!</a></p></div>
`,
linkDensity0: `
<div><p><a href=""></a></p></div>
`,
// rewriteTopLevel
rewriteHTMLBody: {
before: `
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
`,
after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`
},
// cleanImages
cleanSmallImages: {
before: `
<div>
<img width="5" height="5" />
<img width="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`
},
cleanHeight: {
before: `
<div>
<img width="50" height="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`
},
cleanSpacer: {
before: `
<div>
<img src="/foo/bar/baz/spacer.png" />
<img src="/foo/bar/baz/normal.png" />
<p>Some text</p>
</div>
`,
after: `
<div>
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`
},
// stripJunkTags
stripsJunk: {
before: `
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<script type="text/javascript">alert('hi!');</script>
<noscript>Don't got it</noscript>
<hr />
</div>
`,
after: `
<div>
<p>What an article</p>
</div>
`
},
// cleanHOnes
removeTwoHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`
},
convertThreeHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<h2>Look at this!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`
},
// cleanAttributes
removeStyle: {
before: `
<div>
<p style="color: red;">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`
},
removeAlign: {
before: `
<div>
<p style="color: red;" align="center">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`
},
// removeEmpty
removeEmptyP: {
before: `
<div>
<p>What do you think?</p>
<p></p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`
},
doNotRemoveBr: {
before: `
<div>
<p>What do you think?</p>
<p></p>
<div></div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div></div>
<p>What do you think?</p>
</div>
`
},
doNotNested: {
before: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p><iframe src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`
},
// cleanConditionally
dropNegativeScore: {
before: `
<div>
<p>What do you think?</p>
<p>
<ul score="-10">
<li>Foo</li>
<li>Bar</li>
</ul>
</p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>
</p>
<p>What do you think?</p>
</div>
`
},
removeTooManyInputs: {
before: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<div>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
</div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
},
removeShortNoImg: {
before: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf" />
</div>
<div>
<p>Lose this one</p>
</div>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf">
</div>
</div>
`
},
linkDensityHigh: {
before: `
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`
},
goodScoreTooDense: {
before: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`
},
previousEndsInColon: {
before: `
<div weight="40">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p>Now read these links: </p>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
cleanEntryContentAsset: {
before: `
<div score="100">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul score="20" class="entry-content-asset">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// normalizeSpaces
normalizeSpaces: {
before: `
<div>
<p>What do you think?</p>
</div>
`,
after: `What do you think?`
},
// cleanHeaders
cleanFirstHeds: {
before: `
<div>
<h2>Lose me</h2>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`
},
cleanTitleMatch: {
before: `
<div>
<p>What do you think?</p>
<h2>Title Match</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
},
dropWithNegativeWeight: {
before: `
<div>
<p>What do you think?</p>
<h2 class="advert">Bad Class, Bad Weight</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
},
}
export default HTML

@ -0,0 +1,12 @@
export { default as cleanImages } from './clean-images'
export { default as stripJunkTags } from './strip-junk-tags'
export { default as cleanHOnes } from './clean-h-ones'
export { default as cleanAttributes } from './clean-attributes'
export { default as removeEmpty } from './remove-empty'
export { default as cleanTags } from './clean-tags'
export { default as cleanHeaders } from './clean-headers'
export { default as rewriteTopLevel } from './rewrite-top-level'
export { textLength, linkDensity } from './link-density'

@ -1,7 +1,7 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from '../fixtures/html'
import HTML from './fixtures/html'
import { linkDensity } from './index'

@ -1,4 +1,4 @@
import { REMOVE_EMPTY_SELECTORS } from '../constants'
import { REMOVE_EMPTY_SELECTORS } from './constants'
export default function removeEmpty(article, $) {
$(REMOVE_EMPTY_SELECTORS, article).remove()

@ -1,7 +1,7 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from '../fixtures/html'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { removeEmpty } from './index'

@ -1,4 +1,4 @@
import { convertNodeTo } from '../../../../utils/dom'
import { convertNodeTo } from '../../extractor/utils/dom'
// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.

@ -1,7 +1,7 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from '../fixtures/html'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { rewriteTopLevel } from './index'

@ -1,6 +1,6 @@
import {
STRIP_OUTPUT_TAGS
} from '../constants'
} from './constants'
export default function stripJunkTags(article, $) {
$(STRIP_OUTPUT_TAGS.join(','), article).remove()

@ -1,7 +1,7 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from '../fixtures/html'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { stripJunkTags } from './index'

@ -0,0 +1,17 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
/**
 * Normalize a string so two pieces of markup can be compared without
 * caring about layout whitespace.
 *
 * Steps, in order:
 *   1. trim leading/trailing whitespace;
 *   2. delete every line break (\r\n, \n or \r) outright -- note they are
 *      removed, not replaced by a space, so "foo\nbar" becomes "foobar";
 *   3. collapse each remaining run of whitespace into a single space.
 *
 * @param {string} string - Text to normalize.
 * @returns {string} The normalized text.
 */
export function clean(string) {
  const trimmed = string.trim()
  const withoutLineBreaks = trimmed.replace(/\r?\n|\r/g, '')
  const singleSpaced = withoutLineBreaks.replace(/\s+/g, ' ')
  return singleSpaced
}
/**
 * Run `fn` against a fixture's "before" markup and assert that the markup
 * it produces equals the fixture's "after" markup, once both sides have
 * been normalized with clean().
 *
 * @param {string} key - Property name of the fixture inside HTML; the entry
 *   is read for its `before` and `after` strings.
 * @param {Function} fn - Called with the cheerio instance loaded from the
 *   "before" markup; its return value must expose an html() method.
 */
export function assertBeforeAndAfter(key, fn) {
  const fixture = HTML[key]
  const $ = cheerio.load(fixture.before)

  // Produce the actual markup first, then the expected one, and compare.
  const actual = clean(fn($).html())
  const expected = clean(fixture.after)

  assert.equal(actual, expected)
}
/**
 * Assert that two strings are equal after both have been normalized with
 * clean(), so differences in line breaks and whitespace runs are ignored.
 *
 * @param {string} a - First string; passed as the `actual` side of the
 *   comparison.
 * @param {string} b - Second string; passed as the `expected` side of the
 *   comparison.
 */
export function assertClean(a, b) {
  const actual = clean(a)
  const expected = clean(b)

  assert.equal(actual, expected)
}

@ -0,0 +1,664 @@
const HTML = {
// getWeight fixtures
positiveId: `
<div id="entry">
<p>Ooo good one</p>
</div>
`,
negativeId: `
<div id="adbox">
<p>Ooo good one</p>
</div>
`,
positiveClass: `
<div class="entry">
<p>Ooo good one</p>
</div>
`,
negativeClass: `
<div id="comment ad">
<p>Ooo good one</p>
</div>
`,
positiveIdAndClass: `
<div id="article" class="entry">
<p>Ooo good one</p>
</div>
`,
positiveIdNegClass: `
<div id="article" class="adbox">
<p>Ooo good one</p>
</div>
`,
positivePhotoClass: `
<div class="figure">
<p>Ooo good one</p>
</div>
`,
positiveIdAndPhoto: `
<div id="article" class="figure">
<p>Ooo good one</p>
</div>
`,
entryContentAsset: `
<div id="foo" class="entry-content-asset">
<p>Ooo good one</p>
</div>
`,
// stripUnlikelyCandidates
noMatches: `
<div id="foo">
<p>Ooo good one</p>
</div>
`,
whitelistMatch: {
before: `
<div class="header">Stuff</div>
<div class="article">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article">
<p>Ooo good one</p>
</div>
`,
},
whiteAndBlack: {
before: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p>Ooo good one</p>
</div>
`,
},
whiteInsideBlack: {
before: `
<div>
<div class="adbox">
<div class="article">
<p>Ooo good one</p>
</div>
</div>
<div>Something unrelated</div>
</div>
`,
after: `
<div>
<div>Something unrelated</div>
</div>
`,
},
// brsToPs
singleBr: {
before: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<br>
<p>Ooo good one</p>
</div>
`,
},
doubleBrs: {
before: `
<div class="article adbox">
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
severalBrs: {
before: `
<div class="article adbox">
<br />
<br />
<br />
<br />
<br />
<p>Ooo good one</p>
</div>
`,
after: `
<div class="article adbox">
<p> </p><p>Ooo good one</p>
</div>
`,
},
brsInP: {
before: `
<p>
Here is some text
<br />
<br />
Here is more text
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p></p>
`,
},
paragraphize: {
before: `
<p>
Here is some text
<br />
Here is more text
<span>And also this</span>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
<span>And also this</span>
</p></p>
`,
},
paragraphizeBlock: {
before: `
<p>
Here is some text
<br />
Here is more text
<div>And also this</div>
</p>
`,
after: `
<p>
Here is some text
<p>
Here is more text
</p><div>And also this</div>
</p>
`,
},
// convertToParagraphs
convertToParagraphs: {
before: `
<p>
Here is some text
<span>This should remain in a p</span>
<br />
<br />
This should be wrapped in a p
<div>This should become a p</div>
</p>
<span>This should become a p</span>
`,
after: `
<p>
Here is some text
<span>This should remain in a p</span>
<p>
This should be wrapped in a p
</p><p>This should become a p</p>
</p> <p>This should become a p</p>
`,
},
// linkDensity
linkDensity5: `
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
`,
linkDensity1: `
<div><p><a href="">Some text!</a></p></div>
`,
linkDensity0: `
<div><p><a href=""></a></p></div>
`,
// rewriteTopLevel
rewriteHTMLBody: {
before: `
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
`,
after: `
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
`
},
// cleanImages
cleanSmallImages: {
before: `
<div>
<img width="5" height="5" />
<img width="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`
},
cleanHeight: {
before: `
<div>
<img width="50" height="50" />
</div>
`,
after: `
<div>
<img width="50">
</div>
`
},
cleanSpacer: {
before: `
<div>
<img src="/foo/bar/baz/spacer.png" />
<img src="/foo/bar/baz/normal.png" />
<p>Some text</p>
</div>
`,
after: `
<div>
<img src="/foo/bar/baz/normal.png">
<p>Some text</p>
</div>
`
},
// stripJunkTags
stripsJunk: {
before: `
<div>
<style>.red { color: 'red'; }</style>
<title>WOW</title>
<link rel="asdflkjawef" />
<p>What an article</p>
<script type="text/javascript">alert('hi!');</script>
<noscript>Don't got it</noscript>
<hr />
</div>
`,
after: `
<div>
<p>What an article</p>
</div>
`
},
// cleanHOnes
removeTwoHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`
},
convertThreeHOnes: {
before: `
<div>
<h1>Look at this!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
<p>What do you think?</p>
<h1>Can you believe it?!</h1>
</div>
`,
after: `
<div>
<h2>Look at this!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
<p>What do you think?</p>
<h2>Can you believe it?!</h2>
</div>
`
},
// cleanAttributes
removeStyle: {
before: `
<div>
<p style="color: red;">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`
},
removeAlign: {
before: `
<div>
<p style="color: red;" align="center">What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`
},
// removeEmpty
removeEmptyP: {
before: `
<div>
<p>What do you think?</p>
<p></p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
</div>
`
},
doNotRemoveBr: {
before: `
<div>
<p>What do you think?</p>
<p></p>
<div></div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div></div>
<p>What do you think?</p>
</div>
`
},
doNotNested: {
before: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p><iframe src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p><img src="foo/bar.jpg" /></p>
<p>What do you think?</p>
</div>
`
},
// cleanConditionally
dropNegativeScore: {
before: `
<div>
<p>What do you think?</p>
<p>
<ul score="-10">
<li>Foo</li>
<li>Bar</li>
</ul>
</p>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>
</p>
<p>What do you think?</p>
</div>
`
},
removeTooManyInputs: {
before: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<div>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
<p>What is your name?</p>
<input type="text"></input>
</div>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
},
removeShortNoImg: {
before: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf" />
</div>
<div>
<p>Lose this one</p>
</div>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<div>
<p>Keep this one</p>
<img src="asdf">
</div>
</div>
`
},
linkDensityHigh: {
before: `
<div score="0">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="20">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`
},
goodScoreTooDense: {
before: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
after: `
<div>
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
<li>Keep this one</li>
</ul>
</div>
`
},
previousEndsInColon: {
before: `
<div weight="40">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<p>Now read these links: </p>
<ul score="30">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
cleanEntryContentAsset: {
before: `
<div score="100">
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
<ul score="20" class="entry-content-asset">
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
<li><a href="#">Lose this one</a></li>
</ul>
</div>
`,
},
// normalizeSpaces
normalizeSpaces: {
before: `
<div>
<p>What do you think?</p>
</div>
`,
after: `What do you think?`
},
// cleanHeaders
cleanFirstHeds: {
before: `
<div>
<h2>Lose me</h2>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<h2>Keep me</h2>
<p>What do you think?</p>
</div>
`
},
cleanTitleMatch: {
before: `
<div>
<p>What do you think?</p>
<h2>Title Match</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
},
dropWithNegativeWeight: {
before: `
<div>
<p>What do you think?</p>
<h2 class="advert">Bad Class, Bad Weight</h2>
<p>What do you think?</p>
</div>
`,
after: `
<div>
<p>What do you think?</p>
<p>What do you think?</p>
</div>
`
},
}
export default HTML

@ -1,7 +1,7 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from '../fixtures/html'
import HTML from './fixtures/html'
import { normalizeSpaces } from './index'
Loading…
Cancel
Save