refactor: cleaners now run on custom extractors
Squashed commit of the following: commit e4c7d1d149d1846f0d589b3653655b81b477c682 Author: Adam Pash <adam.pash@gmail.com> Date: Thu Sep 8 19:29:26 2016 -0400 refactor: cleaners now run on custom extractors commit ca08d2482c54bf6a40f50758da9353f00987a4d7 Author: Adam Pash <adam.pash@gmail.com> Date: Thu Sep 8 14:42:19 2016 -0400 moved cleaners, refactored as necessary commit ec2c5d36410b255c6d8ee264deca990c46709c3c Author: Adam Pash <adam.pash@gmail.com> Date: Thu Sep 8 14:07:01 2016 -0400 moved datePublished cleaner commit 5e55e397eecb3e88d64cd2aa2c6071c9cffed272 Author: Adam Pash <adam.pash@gmail.com> Date: Thu Sep 8 13:34:21 2016 -0400 moved dek cleaner commit 2dfb0c44d7882336992fdc864792df6eac094c21 Author: Adam Pash <adam.pash@gmail.com> Date: Thu Sep 8 13:29:37 2016 -0400 moved lead-image-url commit cef7a213b80ddd671249225622f1388f9e68896c Author: Adam Pash <adam.pash@gmail.com> Date: Thu Sep 8 13:26:20 2016 -0400 moved authorpull/1/head
parent
603682239d
commit
91881df523
@ -1,6 +1,6 @@
|
||||
import assert from 'assert'
|
||||
|
||||
import cleanAuthor from './clean-author'
|
||||
import cleanAuthor from './author'
|
||||
|
||||
describe('cleanAuthor(author)', () => {
|
||||
it('removes the By from an author string', () => {
|
@ -0,0 +1,39 @@
|
||||
// CLEAN AUTHOR CONSTANTS
|
||||
// Matches a leading byline label: "by", optionally preceded by "posted " or
// "written ", optionally followed by a colon, all case-insensitive.
// Capture group 1 is the optional "posted "/"written " prefix; capture
// group 2 is everything after the label.
// NOTE(review): whitespace after "by" is optional, so a string that merely
// starts with "by" (e.g. "Byron Smith") also matches and group 2 becomes
// "ron Smith" -- confirm whether that is intended.
export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i
|
||||
// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
|
||||
|
||||
// CLEAN DEK CONSTANTS
|
||||
// Case-insensitive match for an "http://" or "https://" scheme anywhere in a
// string; capture group 1 is the optional "s".
export const TEXT_LINK_RE = /http(s)?:\/\//i
|
||||
// An ordered list of meta tag names that denote likely article deks.
|
||||
// From most distinct to least distinct.
|
||||
//
|
||||
// NOTE: There are currently no meta tags that seem to provide the right
|
||||
// content consistently enough. Two options were:
|
||||
// - og:description
|
||||
// - dc.description
|
||||
// However, these tags often have SEO-specific junk in them that's not
|
||||
// header-worthy like a dek is. Excerpt material at best.
|
||||
export const DEK_META_TAGS = [
|
||||
]
|
||||
|
||||
// An ordered list of Selectors to find likely article deks. From
|
||||
// most explicit to least explicit.
|
||||
//
|
||||
// Should be more restrictive than not, as a failed dek can be pretty
|
||||
// detrimental to the aesthetics of an article.
|
||||
export const DEK_SELECTORS = [
|
||||
'.entry-summary',
|
||||
]
|
||||
|
||||
// CLEAN DATE PUBLISHED CONSTANTS
|
||||
// Matches a leading "published" label (case-insensitive) with optional
// surrounding whitespace and an optional colon. Capture group 1 is the
// remainder of the string after the label.
export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i
|
||||
// Matches an "am"/"pm" marker (case-insensitive) that directly follows a
// digit. Groups: 1 = everything up to and including that digit,
// 2 = the meridian, 3 = whatever follows it.
// NOTE(review): presumably used to re-insert a space between the time and the
// meridian -- confirm against the date-published cleaner.
export const TIME_MERIDIAN_RE = /(.*\d)(am|pm)(.*)/i
|
||||
// Global, case-insensitive tokenizer for date strings. Alternatives, in order:
//   1. a clock time such as "9:30" with an optional trailing a/p and m
//   2. a numeric date whose parts are separated by "/" or "-"
//   3. a bare run of 1-4 digits
//   4. a three-letter English month abbreviation
// Because of the `g` flag the regex carries `lastIndex` state when used with
// .test()/.exec().
export const SPLIT_DATE_STRING = /(\d{1,2}:\d{2,2}[ap]?m?)|(\d{1,2}[\/-]\d{1,2}[\/-]\d{2,4})|(\d{1,4})|(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)/ig
|
||||
|
||||
// CLEAN TITLE CONSTANTS
|
||||
// A regular expression that will match separating characters on a
|
||||
// title, that usually denote breadcrumbs or something similar.
|
||||
// Separators are ": ", " - " and " | " (note the required spaces).
// The alternation is wrapped in a capture group, so if this is passed to
// String.prototype.split the separators themselves are kept in the result.
export const TITLE_SPLITTERS_RE = /(: | - | \| )/g
|
||||
|
||||
// Matches a trailing ".com", ".net", ".org" or ".co.uk" at the end of a string.
//
// Written as a regex literal on purpose: inside an ordinary string handed to
// `new RegExp`, `\.` collapses to a bare `.` (any character), which made the
// old pattern match inputs such as "telecom" or "internet" as well.
// The `g` flag is kept so existing callers see the same flags; remember that a
// global regex carries `lastIndex` state across .test()/.exec() calls.
export const DOMAIN_ENDINGS_RE = /\.com$|\.net$|\.org$|\.co\.uk$/g
|
@ -1,18 +1,18 @@
|
||||
import {
|
||||
rewriteTopLevel,
|
||||
cleanImages,
|
||||
stripJunkTags,
|
||||
cleanHOnes,
|
||||
cleanAttributes,
|
||||
cleanHeaders,
|
||||
cleanHOnes,
|
||||
cleanImages,
|
||||
cleanTags,
|
||||
cleanAttributes,
|
||||
removeEmpty,
|
||||
} from './utils/dom'
|
||||
rewriteTopLevel,
|
||||
stripJunkTags,
|
||||
} from '../utils/dom'
|
||||
|
||||
import { convertNodeTo } from '../../utils/dom'
|
||||
import { convertNodeTo } from '../extractor/utils/dom'
|
||||
|
||||
// Clean our article content, returning a new, cleaned node.
|
||||
export default function extractCleanNode(article, $, cleanConditionally=true, title='') {
|
||||
export default function extractCleanNode(article, { $, cleanConditionally=true, title='' }) {
|
||||
// do I need to copy/clone?
|
||||
// Can't I just start over w/fresh html if I need to?
|
||||
// Look into this
|
@ -1,9 +1,9 @@
|
||||
import { TEXT_LINK_RE } from './constants'
|
||||
import { stripTags } from '../../utils/dom'
|
||||
import { stripTags } from '../extractor/utils/dom'
|
||||
|
||||
// Take a dek HTML fragment, and return the cleaned version of it.
|
||||
// Return null if the dek wasn't good enough.
|
||||
export default function cleanDek(dek, $) {
|
||||
export default function cleanDek(dek, { $ }) {
|
||||
// Sanity check that we didn't get too short or long of a dek.
|
||||
if (dek.length > 1000 || dek.length < 5) return null
|
||||
|
@ -0,0 +1,26 @@
|
||||
import cleanAuthor from './author'
|
||||
import cleanImage from './lead-image-url'
|
||||
import cleanDek from './dek'
|
||||
import cleanDatePublished from './date-published'
|
||||
import cleanContent from './content'
|
||||
import cleanTitle from './title'
|
||||
|
||||
const Cleaners = {
|
||||
author: cleanAuthor,
|
||||
leadImageUrl: cleanImage,
|
||||
dek: cleanDek,
|
||||
datePublished: cleanDatePublished,
|
||||
content: cleanContent,
|
||||
title: cleanTitle,
|
||||
}
|
||||
|
||||
|
||||
export default Cleaners
|
||||
|
||||
export { cleanAuthor }
|
||||
export { cleanImage }
|
||||
export { cleanDek }
|
||||
export { cleanDatePublished }
|
||||
export { cleanContent }
|
||||
export { cleanTitle }
|
||||
export { default as resolveSplitTitle } from './resolve-split-title'
|
@ -1,6 +1,6 @@
|
||||
import assert from 'assert'
|
||||
|
||||
import clean from './clean'
|
||||
import clean from './lead-image-url'
|
||||
|
||||
describe('clean(leadImageUrl)', () => {
|
||||
it('returns the url if valid', () => {
|
@ -1,23 +0,0 @@
|
||||
export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i')
|
||||
// An ordered list of meta tag names that denote likely article deks.
|
||||
// From most distinct to least distinct.
|
||||
//
|
||||
// NOTE: There are currently no meta tags that seem to provide the right
|
||||
// content consistenty enough. Two options were:
|
||||
// - og:description
|
||||
// - dc.description
|
||||
// However, these tags often have SEO-specific junk in them that's not
|
||||
// header-worthy like a dek is. Excerpt material at best.
|
||||
export const DEK_META_TAGS = [
|
||||
]
|
||||
|
||||
// An ordered list of Selectors to find likely article deks. From
|
||||
// most explicit to least explicit.
|
||||
//
|
||||
// Should be more restrictive than not, as a failed dek can be pretty
|
||||
// detrimental to the aesthetics of an article.
|
||||
export const DEK_SELECTORS = [
|
||||
'.entry-summary',
|
||||
]
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
import cheerio from 'cheerio'
|
||||
import assert from 'assert'
|
||||
|
||||
import HTML from '../fixtures/html'
|
||||
import HTML from './fixtures/html'
|
||||
import { assertClean } from './test-helpers'
|
||||
|
||||
import { cleanAttributes } from './index'
|
@ -1,4 +1,4 @@
|
||||
import { convertNodeTo } from '../../../../utils/dom'
|
||||
import { convertNodeTo } from '../../extractor/utils/dom'
|
||||
|
||||
// H1 tags are typically the article title, which should be extracted
|
||||
// by the title extractor instead. If there's less than 3 of them (<3),
|
@ -1,7 +1,7 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from '../fixtures/html'
|
||||
import HTML from './fixtures/html'
|
||||
import { assertClean } from './test-helpers'
|
||||
|
||||
import { cleanHOnes } from './index'
|
@ -1,6 +1,6 @@
|
||||
import { HEADER_TAG_LIST } from '../constants'
|
||||
import { HEADER_TAG_LIST } from './constants'
|
||||
import { normalizeSpaces } from '../text'
|
||||
import { getWeight } from '../scoring'
|
||||
import { getWeight } from '../../extractor/generic/content/utils/scoring'
|
||||
|
||||
export default function cleanHeaders(article, $, title='') {
|
||||
$(HEADER_TAG_LIST, article).each((index, header) => {
|
@ -1,7 +1,7 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from '../fixtures/html'
|
||||
import HTML from './fixtures/html'
|
||||
import { assertClean } from './test-helpers'
|
||||
|
||||
import { cleanHeaders } from './index'
|
@ -1,4 +1,4 @@
|
||||
import { SPACER_RE } from '../constants'
|
||||
import { SPACER_RE } from './constants'
|
||||
|
||||
export default function cleanImages(article, $) {
|
||||
$(article).find('img').each((index, img) => {
|
@ -1,7 +1,7 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from '../fixtures/html'
|
||||
import HTML from './fixtures/html'
|
||||
import { assertClean } from './test-helpers'
|
||||
|
||||
import { cleanImages } from './index'
|
@ -1,10 +1,11 @@
|
||||
import { CLEAN_CONDITIONALLY_TAGS } from '../constants'
|
||||
import { CLEAN_CONDITIONALLY_TAGS } from './constants'
|
||||
import {
|
||||
getScore,
|
||||
setScore,
|
||||
getOrInitScore,
|
||||
scoreCommas,
|
||||
} from '../scoring'
|
||||
} from '../../extractor/generic/content/utils/scoring'
|
||||
|
||||
import { normalizeSpaces } from '../text'
|
||||
|
||||
import { linkDensity } from './index'
|
@ -1,7 +1,7 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from '../fixtures/html'
|
||||
import HTML from './fixtures/html'
|
||||
import { assertClean } from './test-helpers'
|
||||
|
||||
import { cleanTags } from './index'
|
@ -0,0 +1,28 @@
|
||||
// Spacer images to be removed
|
||||
// Case-insensitive match for the substrings "trans", "transparent", "spacer"
// or "blank".
export const SPACER_RE = /trans|transparent|spacer|blank/i
|
||||
|
||||
// A list of tags to strip from the output if we encounter them.
|
||||
export const STRIP_OUTPUT_TAGS = [
|
||||
'title',
|
||||
'script',
|
||||
'noscript',
|
||||
'link',
|
||||
'style',
|
||||
'hr',
|
||||
]
|
||||
|
||||
// cleanAttributes
|
||||
export const REMOVE_ATTRS = ['style', 'align']
|
||||
// One CSS attribute selector per entry of REMOVE_ATTRS, e.g. `[style]`.
// Note this stays an array; it is not joined into a single selector string.
export const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(selector => `[${selector}]`)
|
||||
// The REMOVE_ATTRS names joined with commas into one string (plain attribute
// names, without the surrounding `[...]` selector brackets).
export const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',')
|
||||
|
||||
// removeEmpty
|
||||
export const REMOVE_EMPTY_TAGS = ['p']
|
||||
// A single comma-separated selector string that appends `:empty` to each tag
// in REMOVE_EMPTY_TAGS.
export const REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(tag => `${tag}:empty`).join(',')
|
||||
|
||||
// cleanTags
|
||||
// Comma-separated selector string of the tag names ul, ol, table and div.
export const CLEAN_CONDITIONALLY_TAGS = 'ul,ol,table,div'
|
||||
|
||||
// cleanHeaders
|
||||
const HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6']
|
||||
export const HEADER_TAG_LIST = HEADER_TAGS.join(',')
|
@ -0,0 +1,664 @@
|
||||
const HTML = {
|
||||
// getWeight fixtures
|
||||
positiveId: `
|
||||
<div id="entry">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
negativeId: `
|
||||
<div id="adbox">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
positiveClass: `
|
||||
<div class="entry">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
negativeClass: `
|
||||
<div id="comment ad">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
positiveIdAndClass: `
|
||||
<div id="article" class="entry">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
positiveIdNegClass: `
|
||||
<div id="article" class="adbox">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
positivePhotoClass: `
|
||||
<div class="figure">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
positiveIdAndPhoto: `
|
||||
<div id="article" class="figure">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
entryContentAsset: `
|
||||
<div id="foo" class="entry-content-asset">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
|
||||
// stripUnlikelyCandidates
|
||||
noMatches: `
|
||||
<div id="foo">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
whitelistMatch: {
|
||||
before: `
|
||||
<div class="header">Stuff</div>
|
||||
<div class="article">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div class="article">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
whiteAndBlack: {
|
||||
before: `
|
||||
<div class="article adbox">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div class="article adbox">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
whiteInsideBlack: {
|
||||
before: `
|
||||
<div>
|
||||
<div class="adbox">
|
||||
<div class="article">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
</div>
|
||||
<div>Something unrelated</div>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<div>Something unrelated</div>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
|
||||
// brsToPs
|
||||
singleBr: {
|
||||
before: `
|
||||
<div class="article adbox">
|
||||
<br>
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div class="article adbox">
|
||||
<br>
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
doubleBrs: {
|
||||
before: `
|
||||
<div class="article adbox">
|
||||
<br />
|
||||
<br />
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div class="article adbox">
|
||||
<p> </p><p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
severalBrs: {
|
||||
before: `
|
||||
<div class="article adbox">
|
||||
<br />
|
||||
<br />
|
||||
<br />
|
||||
<br />
|
||||
<br />
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div class="article adbox">
|
||||
<p> </p><p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
brsInP: {
|
||||
before: `
|
||||
<p>
|
||||
Here is some text
|
||||
<br />
|
||||
<br />
|
||||
Here is more text
|
||||
</p>
|
||||
`,
|
||||
after: `
|
||||
<p>
|
||||
Here is some text
|
||||
<p>
|
||||
Here is more text
|
||||
</p></p>
|
||||
`,
|
||||
},
|
||||
paragraphize: {
|
||||
before: `
|
||||
<p>
|
||||
Here is some text
|
||||
<br />
|
||||
Here is more text
|
||||
<span>And also this</span>
|
||||
</p>
|
||||
`,
|
||||
after: `
|
||||
<p>
|
||||
Here is some text
|
||||
<p>
|
||||
Here is more text
|
||||
<span>And also this</span>
|
||||
</p></p>
|
||||
`,
|
||||
},
|
||||
paragraphizeBlock: {
|
||||
before: `
|
||||
<p>
|
||||
Here is some text
|
||||
<br />
|
||||
Here is more text
|
||||
<div>And also this</div>
|
||||
</p>
|
||||
`,
|
||||
after: `
|
||||
<p>
|
||||
Here is some text
|
||||
<p>
|
||||
Here is more text
|
||||
</p><div>And also this</div>
|
||||
</p>
|
||||
`,
|
||||
},
|
||||
|
||||
// convertToParagraphs
|
||||
convertToParagraphs: {
|
||||
before: `
|
||||
<p>
|
||||
Here is some text
|
||||
<span>This should remain in a p</span>
|
||||
<br />
|
||||
<br />
|
||||
This should be wrapped in a p
|
||||
<div>This should become a p</div>
|
||||
</p>
|
||||
<span>This should become a p</span>
|
||||
`,
|
||||
after: `
|
||||
<p>
|
||||
Here is some text
|
||||
<span>This should remain in a p</span>
|
||||
<p>
|
||||
This should be wrapped in a p
|
||||
</p><p>This should become a p</p>
|
||||
</p> <p>This should become a p</p>
|
||||
`,
|
||||
},
|
||||
|
||||
// linkDensity
|
||||
linkDensity5: `
|
||||
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
|
||||
`,
|
||||
linkDensity1: `
|
||||
<div><p><a href="">Some text!</a></p></div>
|
||||
`,
|
||||
linkDensity0: `
|
||||
<div><p><a href=""></a></p></div>
|
||||
`,
|
||||
|
||||
// rewriteTopLevel
|
||||
rewriteHTMLBody: {
|
||||
before: `
|
||||
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
|
||||
`,
|
||||
after: `
|
||||
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
|
||||
`
|
||||
},
|
||||
|
||||
// cleanImages
|
||||
cleanSmallImages: {
|
||||
before: `
|
||||
<div>
|
||||
<img width="5" height="5" />
|
||||
<img width="50" />
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<img width="50">
|
||||
</div>
|
||||
`
|
||||
},
|
||||
cleanHeight: {
|
||||
before: `
|
||||
<div>
|
||||
<img width="50" height="50" />
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<img width="50">
|
||||
</div>
|
||||
`
|
||||
},
|
||||
cleanSpacer: {
|
||||
before: `
|
||||
<div>
|
||||
<img src="/foo/bar/baz/spacer.png" />
|
||||
<img src="/foo/bar/baz/normal.png" />
|
||||
<p>Some text</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<img src="/foo/bar/baz/normal.png">
|
||||
<p>Some text</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
// stripJunkTags
|
||||
stripsJunk: {
|
||||
before: `
|
||||
<div>
|
||||
<style>.red { color: 'red'; }</style>
|
||||
<title>WOW</title>
|
||||
<link rel="asdflkjawef" />
|
||||
<p>What an article</p>
|
||||
<script type="text/javascript">alert('hi!');</script>
|
||||
<noscript>Don't got it</noscript>
|
||||
<hr />
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What an article</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
|
||||
// stripHOnes
|
||||
removeTwoHOnes: {
|
||||
before: `
|
||||
<div>
|
||||
<h1>Look at this!</h1>
|
||||
<p>What do you think?</p>
|
||||
<h1>Can you believe it?!</h1>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
convertThreeHOnes: {
|
||||
before: `
|
||||
<div>
|
||||
<h1>Look at this!</h1>
|
||||
<p>What do you think?</p>
|
||||
<h1>Can you believe it?!</h1>
|
||||
<p>What do you think?</p>
|
||||
<h1>Can you believe it?!</h1>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<h2>Look at this!</h2>
|
||||
<p>What do you think?</p>
|
||||
<h2>Can you believe it?!</h2>
|
||||
<p>What do you think?</p>
|
||||
<h2>Can you believe it?!</h2>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
|
||||
// cleanAttributes
|
||||
removeStyle: {
|
||||
before: `
|
||||
<div>
|
||||
<p style="color: red;">What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
removeAlign: {
|
||||
before: `
|
||||
<div>
|
||||
<p style="color: red;" align="center">What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
|
||||
// removeEmpty
|
||||
removeEmptyP: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p></p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
doNotRemoveBr: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p></p>
|
||||
<div></div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<div></div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
doNotNested: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p><img src="foo/bar.jpg" /></p>
|
||||
<p><iframe src="foo/bar.jpg" /></p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p><img src="foo/bar.jpg" /></p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
|
||||
// cleanConditionally
|
||||
dropNegativeScore: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>
|
||||
<ul score="-10">
|
||||
<li>Foo</li>
|
||||
<li>Bar</li>
|
||||
</ul>
|
||||
</p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>
|
||||
</p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
removeTooManyInputs: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<div>
|
||||
<p>What is your name?</p>
|
||||
<input type="text"></input>
|
||||
<p>What is your name?</p>
|
||||
<input type="text"></input>
|
||||
<p>What is your name?</p>
|
||||
<input type="text"></input>
|
||||
</div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
removeShortNoImg: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<div>
|
||||
<p>Keep this one</p>
|
||||
<img src="asdf" />
|
||||
</div>
|
||||
<div>
|
||||
<p>Lose this one</p>
|
||||
</div>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<div>
|
||||
<p>Keep this one</p>
|
||||
<img src="asdf">
|
||||
</div>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
|
||||
linkDensityHigh: {
|
||||
before: `
|
||||
<div score="0">
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<ul>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
</ul>
|
||||
<ul score="20">
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<ul>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
</ul>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
goodScoreTooDense: {
|
||||
before: `
|
||||
<div>
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<ul>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
</ul>
|
||||
<ul score="30">
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<ul>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
</ul>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
previousEndsInColon: {
|
||||
before: `
|
||||
<div weight="40">
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<p>Now read these links: </p>
|
||||
<ul score="30">
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
cleanEntryContentAsset: {
|
||||
before: `
|
||||
<div score="100">
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<ul score="20" class="entry-content-asset">
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
|
||||
// normalizeSpaces
|
||||
normalizeSpaces: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `What do you think?`
|
||||
},
|
||||
|
||||
// cleanHeaders
|
||||
cleanFirstHeds: {
|
||||
before: `
|
||||
<div>
|
||||
<h2>Lose me</h2>
|
||||
<p>What do you think?</p>
|
||||
<h2>Keep me</h2>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<h2>Keep me</h2>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
cleanTitleMatch: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<h2>Title Match</h2>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
dropWithNegativeWeight: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<h2 class="advert">Bad Class, Bad Weight</h2>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
}
|
||||
|
||||
export default HTML
|
@ -0,0 +1,12 @@
|
||||
export { default as cleanImages } from './clean-images'
|
||||
export { default as stripJunkTags } from './strip-junk-tags'
|
||||
export { default as cleanHOnes } from './clean-h-ones'
|
||||
export { default as cleanAttributes } from './clean-attributes'
|
||||
export { default as removeEmpty } from './remove-empty'
|
||||
export { default as cleanTags } from './clean-tags'
|
||||
export { default as cleanHeaders } from './clean-headers'
|
||||
export { default as rewriteTopLevel } from './rewrite-top-level'
|
||||
export { textLength, linkDensity } from './link-density'
|
||||
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from '../fixtures/html'
|
||||
import HTML from './fixtures/html'
|
||||
|
||||
import { linkDensity } from './index'
|
||||
|
@ -1,4 +1,4 @@
|
||||
import { REMOVE_EMPTY_SELECTORS } from '../constants'
|
||||
import { REMOVE_EMPTY_SELECTORS } from './constants'
|
||||
|
||||
export default function removeEmpty(article, $) {
|
||||
$(REMOVE_EMPTY_SELECTORS, article).remove()
|
@ -1,7 +1,7 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from '../fixtures/html'
|
||||
import HTML from './fixtures/html'
|
||||
import { assertClean } from './test-helpers'
|
||||
|
||||
import { removeEmpty } from './index'
|
@ -1,4 +1,4 @@
|
||||
import { convertNodeTo } from '../../../../utils/dom'
|
||||
import { convertNodeTo } from '../../extractor/utils/dom'
|
||||
|
||||
// Rewrite the tag name to div if it's a top level node like body or
|
||||
// html to avoid later complications with multiple body tags.
|
@ -1,7 +1,7 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from '../fixtures/html'
|
||||
import HTML from './fixtures/html'
|
||||
import { assertClean } from './test-helpers'
|
||||
|
||||
import { rewriteTopLevel } from './index'
|
@ -1,6 +1,6 @@
|
||||
import {
|
||||
STRIP_OUTPUT_TAGS
|
||||
} from '../constants'
|
||||
} from './constants'
|
||||
|
||||
export default function stripJunkTags(article, $) {
|
||||
$(STRIP_OUTPUT_TAGS.join(','), article).remove()
|
@ -1,7 +1,7 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from '../fixtures/html'
|
||||
import HTML from './fixtures/html'
|
||||
import { assertClean } from './test-helpers'
|
||||
|
||||
import { stripJunkTags } from './index'
|
@ -0,0 +1,17 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
import HTML from './fixtures/html'
|
||||
|
||||
// Normalize a string for whitespace-insensitive comparison: trim both ends,
// delete every line break outright, then squeeze each remaining run of
// whitespace down to a single space.
export function clean(string) {
  const trimmed = string.trim()
  const withoutLineBreaks = trimmed.replace(/\r?\n|\r/g, '')
  return withoutLineBreaks.replace(/\s+/g, ' ')
}
|
||||
|
||||
// Load the `before` markup of the HTML fixture stored under `key`, pass the
// resulting cheerio instance to `fn`, and assert that the html() of whatever
// `fn` returns equals the fixture's `after` markup once both sides have been
// normalized with clean().
export function assertBeforeAndAfter(key, fn) {
  const fixture = HTML[key]
  const $ = cheerio.load(fixture.before)
  const actual = clean(fn($).html())
  const expected = clean(HTML[key].after)
  assert.equal(actual, expected)
}
|
||||
|
||||
// Assert that `a` and `b` are equal after both have been normalized by clean().
export function assertClean(a, b) {
  const actual = clean(a)
  const expected = clean(b)
  assert.equal(actual, expected)
}
|
||||
|
@ -0,0 +1,664 @@
|
||||
const HTML = {
|
||||
// getWeight fixtures
|
||||
positiveId: `
|
||||
<div id="entry">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
negativeId: `
|
||||
<div id="adbox">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
positiveClass: `
|
||||
<div class="entry">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
negativeClass: `
|
||||
<div id="comment ad">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
positiveIdAndClass: `
|
||||
<div id="article" class="entry">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
positiveIdNegClass: `
|
||||
<div id="article" class="adbox">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
positivePhotoClass: `
|
||||
<div class="figure">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
positiveIdAndPhoto: `
|
||||
<div id="article" class="figure">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
entryContentAsset: `
|
||||
<div id="foo" class="entry-content-asset">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
|
||||
// stripUnlikelyCandidates
|
||||
noMatches: `
|
||||
<div id="foo">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
whitelistMatch: {
|
||||
before: `
|
||||
<div class="header">Stuff</div>
|
||||
<div class="article">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div class="article">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
whiteAndBlack: {
|
||||
before: `
|
||||
<div class="article adbox">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div class="article adbox">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
whiteInsideBlack: {
|
||||
before: `
|
||||
<div>
|
||||
<div class="adbox">
|
||||
<div class="article">
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
</div>
|
||||
<div>Something unrelated</div>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<div>Something unrelated</div>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
|
||||
// brsToPs
|
||||
singleBr: {
|
||||
before: `
|
||||
<div class="article adbox">
|
||||
<br>
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div class="article adbox">
|
||||
<br>
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
doubleBrs: {
|
||||
before: `
|
||||
<div class="article adbox">
|
||||
<br />
|
||||
<br />
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div class="article adbox">
|
||||
<p> </p><p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
severalBrs: {
|
||||
before: `
|
||||
<div class="article adbox">
|
||||
<br />
|
||||
<br />
|
||||
<br />
|
||||
<br />
|
||||
<br />
|
||||
<p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div class="article adbox">
|
||||
<p> </p><p>Ooo good one</p>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
brsInP: {
|
||||
before: `
|
||||
<p>
|
||||
Here is some text
|
||||
<br />
|
||||
<br />
|
||||
Here is more text
|
||||
</p>
|
||||
`,
|
||||
after: `
|
||||
<p>
|
||||
Here is some text
|
||||
<p>
|
||||
Here is more text
|
||||
</p></p>
|
||||
`,
|
||||
},
|
||||
paragraphize: {
|
||||
before: `
|
||||
<p>
|
||||
Here is some text
|
||||
<br />
|
||||
Here is more text
|
||||
<span>And also this</span>
|
||||
</p>
|
||||
`,
|
||||
after: `
|
||||
<p>
|
||||
Here is some text
|
||||
<p>
|
||||
Here is more text
|
||||
<span>And also this</span>
|
||||
</p></p>
|
||||
`,
|
||||
},
|
||||
paragraphizeBlock: {
|
||||
before: `
|
||||
<p>
|
||||
Here is some text
|
||||
<br />
|
||||
Here is more text
|
||||
<div>And also this</div>
|
||||
</p>
|
||||
`,
|
||||
after: `
|
||||
<p>
|
||||
Here is some text
|
||||
<p>
|
||||
Here is more text
|
||||
</p><div>And also this</div>
|
||||
</p>
|
||||
`,
|
||||
},
|
||||
|
||||
// convertToParagraphs
|
||||
convertToParagraphs: {
|
||||
before: `
|
||||
<p>
|
||||
Here is some text
|
||||
<span>This should remain in a p</span>
|
||||
<br />
|
||||
<br />
|
||||
This should be wrapped in a p
|
||||
<div>This should become a p</div>
|
||||
</p>
|
||||
<span>This should become a p</span>
|
||||
`,
|
||||
after: `
|
||||
<p>
|
||||
Here is some text
|
||||
<span>This should remain in a p</span>
|
||||
<p>
|
||||
This should be wrapped in a p
|
||||
</p><p>This should become a p</p>
|
||||
</p> <p>This should become a p</p>
|
||||
`,
|
||||
},
|
||||
|
||||
// linkDensity
|
||||
linkDensity5: `
|
||||
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>
|
||||
`,
|
||||
linkDensity1: `
|
||||
<div><p><a href="">Some text!</a></p></div>
|
||||
`,
|
||||
linkDensity0: `
|
||||
<div><p><a href=""></a></p></div>
|
||||
`,
|
||||
|
||||
// rewriteTopLevel
|
||||
rewriteHTMLBody: {
|
||||
before: `
|
||||
<html><body><div><p><a href="">Wow how about that</a></p></div></body></html>
|
||||
`,
|
||||
after: `
|
||||
<div><div><div><p><a href="">Wow how about that</a></p></div></div></div>
|
||||
`
|
||||
},
|
||||
|
||||
// cleanImages
|
||||
cleanSmallImages: {
|
||||
before: `
|
||||
<div>
|
||||
<img width="5" height="5" />
|
||||
<img width="50" />
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<img width="50">
|
||||
</div>
|
||||
`
|
||||
},
|
||||
cleanHeight: {
|
||||
before: `
|
||||
<div>
|
||||
<img width="50" height="50" />
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<img width="50">
|
||||
</div>
|
||||
`
|
||||
},
|
||||
cleanSpacer: {
|
||||
before: `
|
||||
<div>
|
||||
<img src="/foo/bar/baz/spacer.png" />
|
||||
<img src="/foo/bar/baz/normal.png" />
|
||||
<p>Some text</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<img src="/foo/bar/baz/normal.png">
|
||||
<p>Some text</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
// stripJunkTags
|
||||
stripsJunk: {
|
||||
before: `
|
||||
<div>
|
||||
<style>.red { color: 'red'; }</style>
|
||||
<title>WOW</title>
|
||||
<link rel="asdflkjawef" />
|
||||
<p>What an article</p>
|
||||
<script type="text/javascript">alert('hi!');</script>
|
||||
<noscript>Don't got it</noscript>
|
||||
<hr />
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What an article</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
|
||||
// stripHOnes
|
||||
removeTwoHOnes: {
|
||||
before: `
|
||||
<div>
|
||||
<h1>Look at this!</h1>
|
||||
<p>What do you think?</p>
|
||||
<h1>Can you believe it?!</h1>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
convertThreeHOnes: {
|
||||
before: `
|
||||
<div>
|
||||
<h1>Look at this!</h1>
|
||||
<p>What do you think?</p>
|
||||
<h1>Can you believe it?!</h1>
|
||||
<p>What do you think?</p>
|
||||
<h1>Can you believe it?!</h1>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<h2>Look at this!</h2>
|
||||
<p>What do you think?</p>
|
||||
<h2>Can you believe it?!</h2>
|
||||
<p>What do you think?</p>
|
||||
<h2>Can you believe it?!</h2>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
|
||||
// cleanAttributes
|
||||
removeStyle: {
|
||||
before: `
|
||||
<div>
|
||||
<p style="color: red;">What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
removeAlign: {
|
||||
before: `
|
||||
<div>
|
||||
<p style="color: red;" align="center">What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
|
||||
// removeEmpty
|
||||
removeEmptyP: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p></p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
doNotRemoveBr: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p></p>
|
||||
<div></div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<div></div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
doNotNested: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p><img src="foo/bar.jpg" /></p>
|
||||
<p><iframe src="foo/bar.jpg" /></p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p><img src="foo/bar.jpg" /></p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
|
||||
// cleanConditionally
|
||||
dropNegativeScore: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>
|
||||
<ul score="-10">
|
||||
<li>Foo</li>
|
||||
<li>Bar</li>
|
||||
</ul>
|
||||
</p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>
|
||||
</p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
removeTooManyInputs: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<div>
|
||||
<p>What is your name?</p>
|
||||
<input type="text"></input>
|
||||
<p>What is your name?</p>
|
||||
<input type="text"></input>
|
||||
<p>What is your name?</p>
|
||||
<input type="text"></input>
|
||||
</div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
removeShortNoImg: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<div>
|
||||
<p>Keep this one</p>
|
||||
<img src="asdf" />
|
||||
</div>
|
||||
<div>
|
||||
<p>Lose this one</p>
|
||||
</div>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<div>
|
||||
<p>Keep this one</p>
|
||||
<img src="asdf">
|
||||
</div>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
|
||||
linkDensityHigh: {
|
||||
before: `
|
||||
<div score="0">
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<ul>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
</ul>
|
||||
<ul score="20">
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<ul>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
</ul>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
goodScoreTooDense: {
|
||||
before: `
|
||||
<div>
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<ul>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
</ul>
|
||||
<ul score="30">
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<ul>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
<li>Keep this one</li>
|
||||
</ul>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
previousEndsInColon: {
|
||||
before: `
|
||||
<div weight="40">
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<p>Now read these links: </p>
|
||||
<ul score="30">
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
cleanEntryContentAsset: {
|
||||
before: `
|
||||
<div score="100">
|
||||
<p>Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor. Aenean massa. Cum sociis natoque penatibus et magnis dis parturient montes, nascetur ridiculus mus. Donec quam felis, ultricies nec, pellentesque eu, pretium quis, sem. Nulla consequat massa quis enim. Donec pede justo, fringilla vel, aliquet nec, vulputate eget, arcu. In enim justo, rhoncus ut, imperdiet a, venenatis vitae, justo. Nullam dictum felis eu pede mollis pretium. Integer tincidunt. Cras dapibus. Vivamus elementum semper nisi. Aenean vulputate eleifend tellus. Aenean leo ligula, porttitor eu.</p>
|
||||
<ul score="20" class="entry-content-asset">
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
<li><a href="#">Lose this one</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
`,
|
||||
},
|
||||
|
||||
// normalizeSpaces
|
||||
normalizeSpaces: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `What do you think?`
|
||||
},
|
||||
|
||||
// cleanHeaders
|
||||
cleanFirstHeds: {
|
||||
before: `
|
||||
<div>
|
||||
<h2>Lose me</h2>
|
||||
<p>What do you think?</p>
|
||||
<h2>Keep me</h2>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<h2>Keep me</h2>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
cleanTitleMatch: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<h2>Title Match</h2>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
dropWithNegativeWeight: {
|
||||
before: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<h2 class="advert">Bad Class, Bad Weight</h2>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`,
|
||||
after: `
|
||||
<div>
|
||||
<p>What do you think?</p>
|
||||
<p>What do you think?</p>
|
||||
</div>
|
||||
`
|
||||
},
|
||||
}
|
||||
|
||||
export default HTML
|
@ -1,7 +1,7 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from '../fixtures/html'
|
||||
import HTML from './fixtures/html'
|
||||
|
||||
import { normalizeSpaces } from './index'
|
||||
|
Loading…
Reference in New Issue