chore: code reorganization

Squashed commit of the following:

commit 636296841d5cf5e685237fe70db7a15305d8e966
Author: Adam Pash <adam.pash@gmail.com>
Date:   Fri Sep 9 13:37:21 2016 -0400

    final cleanup

commit 51f712b3074d41a1f2da91519289d4dd09719ad0
Author: Adam Pash <adam.pash@gmail.com>
Date:   Fri Sep 9 13:25:28 2016 -0400

    Another big pass

commit 3860e6d872a9adb9290093fd9c8708dfcc773c28
Author: Adam Pash <adam.pash@gmail.com>
Date:   Fri Sep 9 12:49:52 2016 -0400

    chore: started reorganizing

@ -3,6 +3,13 @@
"plugins": [
"transform-es2015-destructuring",
"transform-object-rest-spread",
"transform-async-to-generator"
"transform-async-to-generator",
["module-alias", [
{ "src": "./src/utils", "expose": "utils" },
{ "src": "./src/cleaners", "expose": "cleaners" },
{ "src": "./src/resource", "expose": "resource" },
{ "src": "./src/extractors", "expose": "extractors" },
{ "src": "./src/test-helpers.js", "expose": "test-helpers" }
]]
]
}
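
With these aliases in place, Babel rewrites the bare module names to paths under ./src, so deep relative imports become stable top-level ones. A minimal before/after sketch, using imports that appear elsewhere in this diff:

// Before: a relative path that depends on how deep the importing file sits
//   import { stripTags } from '../../extractor/utils/dom'
// After: the 'utils' alias resolves to ./src/utils from anywhere
import { stripTags } from 'utils/dom'
import { cleanAuthor } from 'cleaners'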

@ -1,6 +1,5 @@
TODO:
- remove logic for fetching meta attrs with custom props
- Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)
- extractNextPageUrl
- Rename all cleaners from cleanThing to clean
- Make sure weightNodes flag is being passed properly
@ -19,6 +18,7 @@ x extract and generalize cleaners
x move arguments to cleaners to object
x Check that lead-image-url extractor isn't looking for end-of-string file extension matches (e.g., it could be ...foo.jpg?otherstuff)
x extractLeadImageUrl
x Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)
x extractDek
x extractDatePublished
x Title metadata

@ -12,6 +12,7 @@
"license": "ISC",
"devDependencies": {
"babel-plugin-external-helpers": "^6.8.0",
"babel-plugin-module-alias": "^1.6.0",
"babel-plugin-transform-async-to-generator": "^6.8.0",
"babel-plugin-transform-es2015-destructuring": "^6.9.0",
"babel-plugin-transform-object-rest-spread": "^6.8.0",

@ -8,9 +8,9 @@ import {
rewriteTopLevel,
stripJunkTags,
makeLinksAbsolute,
} from '../utils/dom'
} from 'utils/dom'
import { convertNodeTo } from '../extractor/utils/dom'
import { convertNodeTo } from 'utils/dom'
// Clean our article content, returning a new, cleaned node.
export default function extractCleanNode(

@ -3,7 +3,7 @@ import cheerio from 'cheerio'
import fs from 'fs'
import extractCleanNode from './content'
import extractBestNode from '../extractor/generic/content/extract-best-node'
import extractBestNode from 'extractors/generic/content/extract-best-node'
describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => {
it("cleans cruft out of a DOM node", () => {

@ -1,5 +1,5 @@
import { TEXT_LINK_RE } from './constants'
import { stripTags } from '../extractor/utils/dom'
import { stripTags } from 'utils/dom'
// Take a dek HTML fragment, and return the cleaned version of it.
// Return None if the dek wasn't good enough.

@ -1,6 +1,6 @@
import { TITLE_SPLITTERS_RE } from './constants'
import { resolveSplitTitle } from './index'
import { stripTags } from '../extractor/utils/dom'
import { stripTags } from 'utils/dom'
export default function cleanTitle(title, { url, $ }) {
// If title has |, :, or - in it, see if

@ -1,5 +0,0 @@
// DOM manipulation
export { default as stripUnlikelyCandidates } from './strip-unlikely-candidates'
export { default as brsToPs } from './brs-to-ps'
export { default as paragraphize } from './paragraphize'
export { convertToParagraphs, convertNodeTo } from './convert-to-paragraphs'

@ -1,17 +0,0 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from '../fixtures/html'
export function clean(string) {
return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, ' ')
}
export function assertBeforeAndAfter(key, fn) {
const $ = cheerio.load(HTML[key].before)
assert.equal(clean(fn($).html()), clean(HTML[key].after))
}
export function assertClean(a, b) {
assert.equal(clean(a), clean(b))
}
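
These helpers normalize whitespace before comparing HTML, so tests assert on structure rather than formatting; in this commit they move behind the test-helpers alias. A small usage sketch, assuming the aliased module exposes the same functions:

import cheerio from 'cheerio'
import { assertClean } from 'test-helpers'

const $ = cheerio.load('<div>\n  <p>Hi   there</p>\n</div>')
// Both sides are trimmed and whitespace-collapsed before the equality check
assertClean($('div').html(), '<p>Hi there</p>')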

@ -1,3 +0,0 @@
export { default as extractFromMeta } from './extract-from-meta'
export { default as extractFromSelectors } from './extract-from-selectors'
export { default as extractFromUrl } from './extract-from-url'

@ -1,4 +0,0 @@
export { default as withinComment } from './within-comment'
export { default as convertNodeTo } from './convert-node-to'
export { default as stripTags } from './strip-tags'

@ -1 +0,0 @@
export { default as nodeIsSufficient } from './node-is-sufficient'

@ -5,12 +5,12 @@ import {
BYLINE_SELECTORS_RE,
} from './constants'
import { cleanAuthor } from '../../../cleaners'
import { cleanAuthor } from 'cleaners'
import {
extractFromMeta,
extractFromSelectors
} from '../utils'
} from 'utils/dom'
const GenericAuthorExtractor = {
extract({ $, metaCache }) {

@ -5,7 +5,7 @@ import {
import {
stripUnlikelyCandidates,
convertToParagraphs,
} from './utils/dom'
} from 'utils/dom'
// Using a variety of scoring techniques, extract the content most
// likely to be article text.

@ -2,9 +2,9 @@ import cheerio from 'cheerio'
import 'babel-polyfill'
import extractBestNode from './extract-best-node'
import nodeIsSufficient from '../../utils/node-is-sufficient'
import { cleanContent } from '../../../cleaners'
import { normalizeSpaces } from '../../../utils/text'
import { nodeIsSufficient } from 'utils/dom'
import { cleanContent } from 'cleaners'
import { normalizeSpaces } from 'utils/text'
const GenericContentExtractor = {
defaultOpts: {

@ -2,7 +2,7 @@ import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import { clean } from './utils/dom/test-helpers'
import { clean } from 'test-helpers'
import GenericContentExtractor from './extractor'

@ -3,7 +3,7 @@ import { getScore } from './index'
import {
textLength,
linkDensity
} from '../../../../../utils/dom'
} from 'utils/dom'
// After we've calculated scores, loop through all of the possible
// candidate nodes we found and find the one with the highest score.

@ -8,7 +8,7 @@ import {
addScore,
} from './index'
import { convertNodeTo } from '../../../../utils/dom'
import { convertNodeTo } from 'utils/dom'
// score content. Parents get the full value of their children's
// content score, grandparents half

@ -2,7 +2,7 @@ import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import { clean } from '../dom/test-helpers'
import { clean } from 'test-helpers'
import HTML from './fixtures/html'
import {

@ -4,13 +4,13 @@ import {
DATE_PUBLISHED_URL_RES,
} from './constants'
import { cleanDatePublished } from '../../../cleaners'
import { cleanDatePublished } from 'cleaners'
import {
extractFromMeta,
extractFromSelectors,
extractFromUrl,
} from '../utils'
} from 'utils/dom'
import { extractFromUrl } from 'utils/text'
const GenericDatePublishedExtractor = {
extract({ $, url, metaCache }) {

@ -4,13 +4,12 @@ import {
DEK_URL_RES,
} from './constants'
import { cleanDek } from '../../../cleaners'
import { cleanDek } from 'cleaners'
import {
extractFromMeta,
extractFromSelectors,
extractFromUrl,
} from '../utils'
} from 'utils/dom'
// Currently there is only one selector for
// deks. We should simply return null here

@ -1,7 +1,7 @@
import assert from 'assert'
import fs from 'fs'
import { clean } from './content/utils/dom/test-helpers'
import { clean } from 'test-helpers'
import GenericExtractor from './index'

@ -8,7 +8,7 @@ import {
import {
extractFromMeta,
extractFromSelectors
} from '../utils'
} from 'utils/dom'
import {
scoreImageUrl,
@ -19,7 +19,7 @@ import {
scoreByPosition,
} from './score-image'
import { cleanImage } from '../../../cleaners'
import { cleanImage } from 'cleaners'
// Given a resource, try to find the lead image URL from within
// it. Like content and next page extraction, uses a scoring system

@ -4,11 +4,11 @@ import {
STRONG_TITLE_SELECTORS,
WEAK_TITLE_SELECTORS
} from './constants'
import { cleanTitle } from '../../../cleaners'
import { cleanTitle } from 'cleaners'
import {
extractFromMeta,
extractFromSelectors
} from '../utils'
} from 'utils/dom'
const GenericTitleExtractor = {
extract({ $, url, metaCache }) {

@ -1,8 +1,8 @@
import 'babel-polyfill'
import GenericExtractor from './generic'
import Cleaners from '../cleaners'
import { convertNodeTo, stripTags } from './utils/dom'
import Cleaners from 'cleaners'
import { convertNodeTo, stripTags } from 'utils/dom'
import { ATTR_RE } from './constants'
const RootExtractor = {

@ -15,7 +15,7 @@ import NYMagExtractor from './custom/nymag.com'
describe('RootExtractor', () => {
it('extracts based on custom selectors', () => {
const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
const html = fs.readFileSync('./src/extractor/custom/nymag.com/fixtures/test.html', 'utf8')
const html = fs.readFileSync('./src/extractors/custom/nymag.com/fixtures/test.html', 'utf8')
const $ = cheerio.load(html)
const {

@ -1,10 +1,8 @@
import fs from 'fs'
import Resource from './resource'
import getExtractor from './extractor/get-extractor'
import RootExtractor from './extractor/root-extractor'
import fetchResource from './resource/utils/fetch-resource'
import Resource from 'resource'
import getExtractor from 'extractors/get-extractor'
import RootExtractor from 'extractors/root-extractor'
const Iris = {
parse: async function(url, html) {

@ -1,16 +1,10 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
export function clean(string) {
return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, ' ')
}
export function assertBeforeAndAfter(key, fn) {
const $ = cheerio.load(HTML[key].before)
assert.equal(clean(fn($).html()), clean(HTML[key].after))
}
export function assertClean(a, b) {
assert.equal(clean(a), clean(b))
}

@ -1,11 +1,14 @@
import assert from 'assert'
import cheerio from 'cheerio'
import { assertBeforeAndAfter } from './test-helpers'
import HTML from '../fixtures/html'
import {
brsToPs
} from './index'
import { assertClean } from 'test-helpers'
import HTML from './fixtures/html'
import brsToPs from './brs-to-ps'
function assertBeforeAndAfter(key, fn) {
const $ = cheerio.load(HTML[key].before)
assertClean(fn($).html(), HTML[key].after)
}
describe('Generic Extractor Utils', () => {
describe('brsToPs(node)', () => {

@ -2,7 +2,7 @@ import cheerio from 'cheerio'
import assert from 'assert'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { assertClean } from 'test-helpers'
import { cleanAttributes } from './index'

@ -1,4 +1,4 @@
import { convertNodeTo } from '../../extractor/utils/dom'
import { convertNodeTo } from 'utils/dom'
// H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),

@ -2,7 +2,7 @@ import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { assertClean } from 'test-helpers'
import { cleanHOnes } from './index'

@ -1,6 +1,6 @@
import { HEADER_TAG_LIST } from './constants'
import { normalizeSpaces } from '../text'
import { getWeight } from '../../extractor/generic/content/utils/scoring'
import { getWeight } from 'extractors/generic/content/utils/scoring'
export default function cleanHeaders(article, $, title='') {
$(HEADER_TAG_LIST, article).each((index, header) => {

@ -2,7 +2,7 @@ import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { assertClean } from 'test-helpers'
import { cleanHeaders } from './index'

@ -2,7 +2,7 @@ import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { assertClean } from 'test-helpers'
import { cleanImages } from './index'

@ -4,7 +4,7 @@ import {
setScore,
getOrInitScore,
scoreCommas,
} from '../../extractor/generic/content/utils/scoring'
} from 'extractors/generic/content/utils/scoring'
import { normalizeSpaces } from '../text'

@ -2,7 +2,7 @@ import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import { assertClean } from './test-helpers'
import { assertClean } from 'test-helpers'
import { cleanTags } from './index'

@ -28,3 +28,411 @@ export const CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div'].join(',')
// cleanHeaders
const HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6']
export const HEADER_TAG_LIST = HEADER_TAGS.join(',')
//// CONTENT FETCHING CONSTANTS ////
// A list of strings that can be considered unlikely candidates when
// extracting content from a resource. These strings are joined together
// and then tested for existence using re:test, so may contain simple,
// non-pipe style regular expression queries if necessary.
export const UNLIKELY_CANDIDATES_BLACKLIST = [
'ad-break',
'adbox',
'advert',
'addthis',
'agegate',
'aux',
'blogger-labels',
'combx',
'comment',
'conversation',
'disqus',
'entry-unrelated',
'extra',
'foot',
'form',
'header',
'hidden',
'loader',
'login', // Note: This can hit 'blogindex'.
'menu',
'meta',
'nav',
'pager',
'pagination',
'predicta', // readwriteweb inline ad box
'presence_control_external', // lifehacker.com container full of false positives
'popup',
'printfriendly',
'related',
'remove',
'remark',
'rss',
'share',
'shoutbox',
'sidebar',
'sociable',
'sponsor',
'tools'
]
// A list of strings that can be considered LIKELY candidates when
// extracting content from a resource. Essentially, the inverse of the
// blacklist above - if something matches both blacklist and whitelist,
// it is kept. This is useful, for example, if something has a className
// of "rss-content entry-content". It matched 'rss', so it would normally
// be removed, however, it's also the entry content, so it should be left
// alone.
//
// These strings are joined together and then tested for existence using
// re:test, so may contain simple, non-pipe style regular expression queries
// if necessary.
export const UNLIKELY_CANDIDATES_WHITELIST = [
'and',
'article',
'body',
'blogindex',
'column',
'content',
'entry-content-asset',
'format', // misuse of form
'hfeed',
'hentry',
'hatom',
'main',
'page',
'posts',
'shadow'
]
// A list of tags which, if found inside, should cause a <div /> to NOT
// be turned into a paragraph tag. Shallow div tags without these elements
// should be turned into <p /> tags.
export const DIV_TO_P_BLOCK_TAGS = [
'a',
'blockquote',
'dl',
'div',
'img',
'p',
'pre',
'table',
].join(',')
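
The joined string is a ready-made cheerio selector, so the div-to-p test can be a single find() call. A minimal sketch assuming the constant is imported from this constants module; the helper name is illustrative, not the repo's own:

import { DIV_TO_P_BLOCK_TAGS } from './constants'

// A shallow div containing none of the block-level tags above is
// paragraph-like and safe to convert to a <p>.
export function divIsParagraphLike($, div) {
  return $(div).find(DIV_TO_P_BLOCK_TAGS).length === 0
}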
// A list of tags that should be ignored when trying to find the top candidate
// for a document.
export const NON_TOP_CANDIDATE_TAGS = [
'br',
'b',
'i',
'label',
'hr',
'area',
'base',
'basefont',
'input',
'img',
'link',
'meta',
]
export const NON_TOP_CANDIDATE_TAGS_RE =
new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i')
// A list of selectors that specify, very clearly, either hNews or other
// very content-specific style content, like Blogger templates.
// More examples here: http://microformats.org/wiki/blog-post-formats
export const HNEWS_CONTENT_SELECTORS = [
['.hentry', '.entry-content'],
['entry', '.entry-content'],
['.entry', '.entry_content'],
['.post', '.postbody'],
['.post', '.post_body'],
['.post', '.post-body'],
]
// export const HNEWS_CONTENT_SELECTORS = [
// {
// //selector: XPath('/#<{(|[contains(@class, "hentry")]/#<{(|[contains(@class, "entry-content")]'),
// must_exist: {
// classes: ['hentry', 'entry-content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry-content")]'),
// must_exist: {
// classes: ['entry', 'entry-content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "entry")]/#<{(|[contains(@class, "entry_content")]'),
// must_exist: {
// classes: ['entry', 'entry_content'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post-body")]'),
// must_exist: {
// classes: ['post', 'post-body'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "post_body")]'),
// must_exist: {
// classes: ['post', 'post_body'],
// }
// },
// {
// //selector: XPath('/#<{(|[contains(@class, "post")]/#<{(|[contains(@class, "postbody")]'),
// must_exist: {
// classes: ['post', 'postbody'],
// }
// },
// ]
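
Each entry in HNEWS_CONTENT_SELECTORS reads as a (container, content) selector pair. A hedged sketch of how the pairs could be checked against a document; the lookup function is illustrative, not the extractor's actual code:

import { HNEWS_CONTENT_SELECTORS } from './constants'

// Return the first content node matched via a known (container, content)
// pair, or null if the page doesn't use an hNews-style template.
export function findHNewsContent($) {
  for (const [container, content] of HNEWS_CONTENT_SELECTORS) {
    const node = $(`${container} ${content}`)
    if (node.length > 0) return node
  }
  return null
}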
export const PHOTO_HINTS = [
'figure',
'photo',
'image',
'caption'
]
export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i')
// A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
export const POSITIVE_SCORE_HINTS = [
'article',
'articlecontent',
'instapaper_body',
'blog',
'body',
'content',
'entry-content-asset',
'entry',
'hentry',
'main',
'Normal',
'page',
'pagination',
'permalink',
'post',
'story',
'text',
'[-_]copy', //usatoday
'\Bcopy'
]
// The above list, joined into a matching regular expression
export const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i')
// Readability publisher-specific guidelines
export const READABILITY_ASSET = new RegExp('entry-content-asset', 'i')
// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
export const NEGATIVE_SCORE_HINTS = [
'adbox',
'advert',
'author',
'bio',
'bookmark',
'bottom',
'byline',
'clear',
'com-',
'combx',
'comment',
'comment\B',
'contact',
'copy',
'credit',
'crumb',
'date',
'deck',
'excerpt',
'featured', //tnr.com has a featured_content which throws us off
'foot',
'footer',
'footnote',
'graf',
'head',
'info',
'infotext', //newscientist.com copyright
'instapaper_ignore',
'jump',
'linebreak',
'link',
'masthead',
'media',
'meta',
'modal',
'outbrain', //slate.com junk
'promo',
'pr_', // autoblog - press release
'related',
'respond',
'roundcontent', //lifehacker restricted content warning
'scroll',
'secondary',
'share',
'shopping',
'shoutbox',
'side',
'sidebar',
'sponsor',
'stamp',
'sub',
'summary',
'tags',
'tools',
'widget'
]
// The above list, joined into a matching regular expression
export const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')
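
Together, the positive and negative hint regexes support a simple weight heuristic on a node's class and id. A minimal sketch assuming the constants above; the repo's real getWeight (in the content scoring utils) may weigh things differently:

import { POSITIVE_SCORE_RE, NEGATIVE_SCORE_RE, PHOTO_HINTS_RE } from './constants'

// Illustrative only: positive hints add to the weight, negative hints subtract.
export function roughWeight(node, $) {
  const classAndId = `${$(node).attr('class') || ''} ${$(node).attr('id') || ''}`
  let weight = 0
  if (POSITIVE_SCORE_RE.test(classAndId)) weight += 25
  if (NEGATIVE_SCORE_RE.test(classAndId)) weight -= 25
  if (PHOTO_HINTS_RE.test(classAndId)) weight += 10
  return weight
}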
// XPath to try to determine if a page is wordpress. Not always successful.
export const IS_WP_XPATH = "//meta[@name='generator'][starts-with(@value,'WordPress')]"
// Match a digit. Pretty clear.
export const DIGIT_RE = new RegExp('[0-9]')
// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
export const EXTRANEOUS_LINK_HINTS = [
'print',
'archive',
'comment',
'discuss',
'e-mail',
'email',
'share',
'reply',
'all',
'login',
'sign',
'single',
'adx',
'entry-unrelated'
]
export const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')
// An expression that tries to find the page digit within a URL, if
// it exists.
// Matches:
// page=1
// pg=1
// p=1
// paging=12
// pag=7
// pagination/1
// paging/88
// pa/83
// p/11
//
// Does not match:
// pg=102
// page:2
// DISABLING FOR NOW TODO AP
// export const PAGE_IN_HREF_RE = new RegExp('(page|paging|(p(a|g|ag)?(e|enum|ewanted|ing|ination)?(=|\/)(?P<pagenum>[0-9]{1,2})))', 'i')
// Match any phrase that looks like it could be page, or paging, or pagination
export const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i')
// Match any link text/classname/id that looks like it could mean the next
// page. Things like: next, continue, >, >>, » but not >|, »| as those can
// mean last page.
export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\|]|$)|»([^\|]|$))', 'i')
// Match any link text/classname/id that looks like it is an end link: things
// like "first", "last", "end", etc.
export const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i')
// Match any link text/classname/id that looks like it means the previous
// page.
export const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i')
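
A short sketch of how these link-text patterns could be combined to score candidate pagination links; the function and point values are illustrative:

import {
  NEXT_LINK_TEXT_RE,
  CAP_LINK_TEXT_RE,
  PREV_LINK_TEXT_RE,
  EXTRANEOUS_LINK_HINTS_RE,
} from './constants'

// Illustrative only: bump or penalize a link based on its visible text.
export function scoreLinkText(linkText) {
  let score = 0
  if (NEXT_LINK_TEXT_RE.test(linkText)) score += 50
  if (CAP_LINK_TEXT_RE.test(linkText) || PREV_LINK_TEXT_RE.test(linkText)) score -= 25
  if (EXTRANEOUS_LINK_HINTS_RE.test(linkText)) score -= 25
  return score
}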
// Match 2 or more consecutive <br> tags
export const BR_TAGS_RE = new RegExp('(<br[^>]*>[ \n\r\t]*){2,}', 'i')
// Match 1 BR tag.
export const BR_TAG_RE = new RegExp('<br[^>]*>', 'i')
// A list of all of the block level tags known in HTML5 and below. Taken from
// http://bit.ly/qneNIT
export const BLOCK_LEVEL_TAGS = [
'article',
'aside',
'blockquote',
'body',
'br',
'button',
'canvas',
'caption',
'col',
'colgroup',
'dd',
'div',
'dl',
'dt',
'embed',
'fieldset',
'figcaption',
'figure',
'footer',
'form',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'header',
'hgroup',
'hr',
'li',
'map',
'object',
'ol',
'output',
'p',
'pre',
'progress',
'section',
'table',
'tbody',
'textarea',
'tfoot',
'th',
'thead',
'tr',
'ul',
'video',
]
export const BLOCK_LEVEL_TAGS_RE = new RegExp(`^(${BLOCK_LEVEL_TAGS.join('|')})$`, 'i')
// The removal is implemented as a blacklist and whitelist; this test finds
// blacklisted elements that aren't whitelisted. We do this all in one
// expression, both because it's only one pass and because this skips the
// serialization for whitelisted nodes.
const candidates_blacklist = UNLIKELY_CANDIDATES_BLACKLIST.join('|')
export const CANDIDATES_BLACKLIST = new RegExp(candidates_blacklist, 'i')
const candidates_whitelist = UNLIKELY_CANDIDATES_WHITELIST.join('|')
export const CANDIDATES_WHITELIST = new RegExp(candidates_whitelist, 'i')
export const UNLIKELY_RE = new RegExp(`!(${candidates_whitelist})|(${candidates_blacklist})`, 'i')
export const PARAGRAPH_SCORE_TAGS = new RegExp('^(p|li|span|pre)$', 'i')
export const CHILD_CONTENT_TAGS = new RegExp('^(td|blockquote|ol|ul|dl)$', 'i')
export const BAD_TAGS = new RegExp('^(address|form)$', 'i')
export const HTML_OR_BODY_RE = new RegExp('^(html|body)$', 'i')
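
A hedged sketch of the blacklist-and-whitelist removal the comment above describes, built on the joined regexes; the traversal in the repo's real stripUnlikelyCandidates may differ:

import { CANDIDATES_BLACKLIST, CANDIDATES_WHITELIST } from './constants'

// Remove nodes whose class/id matches the blacklist unless they also match
// the whitelist (e.g. "rss-content entry-content" is kept).
export function stripUnlikelyNodes($) {
  $('*').each((index, node) => {
    const classAndId = `${$(node).attr('class') || ''} ${$(node).attr('id') || ''}`
    if (CANDIDATES_WHITELIST.test(classAndId)) return
    if (CANDIDATES_BLACKLIST.test(classAndId)) $(node).remove()
  })
  return $
}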

@ -2,4 +2,3 @@ export default function convertNodeTo(node, $, tag='p') {
$(node).replaceWith(`<${tag}>${$(node).contents()}</${tag}>`)
return $
}
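
The default-export signature is visible in the hunk above (convertNodeTo(node, $, tag='p')), and the diff elsewhere imports it as a named export from utils/dom, so a quick usage sketch:

import cheerio from 'cheerio'
import { convertNodeTo } from 'utils/dom'

const $ = cheerio.load('<div>Short, paragraph-like text</div>')
// Replace the div with a <p> carrying the same contents; $ is mutated and returned.
convertNodeTo($('div').first(), $, 'p')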

@ -1,7 +1,7 @@
import { convertNodeTo } from '../../../../utils/dom'
import { convertNodeTo } from 'utils/dom'
import { brsToPs } from './index'
import { DIV_TO_P_BLOCK_TAGS } from '../constants'
import { DIV_TO_P_BLOCK_TAGS } from './constants'
// Loop through the provided doc, and convert any p-like elements to
// actual paragraph tags.
//
@ -14,7 +14,7 @@ import { DIV_TO_P_BLOCK_TAGS } from '../constants'
// :return cheerio object with new p elements
// (By-reference mutation, though. Returned just for convenience.)
export function convertToParagraphs($) {
export default function convertToParagraphs($) {
$ = brsToPs($)
$ = convertDivs($)
$ = convertSpans($)
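
Given the comment above, a quick usage sketch of the now-default export (the sample HTML is just an example):

import cheerio from 'cheerio'
import convertToParagraphs from './convert-to-paragraphs'

const $ = cheerio.load('<div>One sentence.<br><br>Another sentence.</div>')
convertToParagraphs($)
// After conversion, the double <br> run and the p-like text should end up in <p> tags.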

@ -1,15 +1,15 @@
import assert from 'assert'
import cheerio from 'cheerio'
import {
assertBeforeAndAfter,
assertClean
} from './test-helpers'
import HTML from '../fixtures/html'
import {
convertToParagraphs
} from './index'
import { assertClean } from 'test-helpers'
import HTML from './fixtures/html'
import convertToParagraphs from './convert-to-paragraphs'
function assertBeforeAndAfter(key, fn) {
const $ = cheerio.load(HTML[key].before)
assertClean(fn($).html(), HTML[key].after)
}
describe('Generic Extractor Utils', () => {
describe('convertToParagraphs($)', () => {

@ -1,4 +1,4 @@
import { stripTags } from '../../utils/dom'
import { stripTags } from 'utils/dom'
// Given a node type to search for, and a list of meta tag names to
// search for, find a meta tag associated.
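
A usage sketch for the utility described by that comment; the argument order here is an assumption drawn from the description (document, preferred meta names, the page's cached meta names), not a confirmed signature, and it assumes Resource has already normalized meta tags as noted in the TODO:

import cheerio from 'cheerio'
import { extractFromMeta } from 'utils/dom'

// Assumed call shape only; check extract-from-meta.js for the real signature.
const $ = cheerio.load('<meta name="og:title" value="An Example Title" />')
const title = extractFromMeta($, ['og:title', 'twitter:title'], ['og:title'])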
