feat: RootExtractor performs extraction using custom and generic

extraction methods
pull/1/head
Adam Pash 8 years ago
parent 937138c7bb
commit 7d88fee199

@ -1,4 +1,5 @@
TODO:
- change customselector to rootselector. consider other options for generalizing cleaning (use generic cleaners)
- run makeLinksAbsolute on extracted content before returning
- remove logic for fetching meta attrs with custom props
- Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)

@ -3,7 +3,7 @@ import babelrc from 'babelrc-rollup'
import commonjs from 'rollup-plugin-commonjs'
export default {
entry: 'src/index.js',
entry: 'src/iris.js',
plugins: [
commonjs(),
babel(babelrc()),

@ -1,58 +0,0 @@
import GenericExtractor from '../generic'
import { stripTags } from '../utils'
// Entry point for extraction with a site-specific extractor definition.
const CustomExtractor = {
  /**
   * Extract the article fields for a page.
   *
   * @param {Object} extractor - Extractor definition. When its `domain`
   *   is '*', it is treated as the generic extractor and its `parse`
   *   method is called with `opts` directly. Defaults to GenericExtractor.
   * @param {Object} opts - Options forwarded to every per-field extraction.
   * @returns {Object} title, content, author, datePublished, leadImageUrl
   *   and dek (or whatever `extractor.parse` returns on the generic path).
   */
  extract(extractor=GenericExtractor, opts) {
    if (extractor.domain === '*') return extractor.parse(opts)

    const title = extract({ ...opts, type: 'title', extractor })
    const datePublished = extract({ ...opts, type: 'datePublished', extractor })
    const author = extract({ ...opts, type: 'author', extractor })
    const content = extract({ ...opts, type: 'content', extractor, html: true })
    const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', extractor, html: true })
    const dek = extract({ ...opts, type: 'dek', extractor, html: true })

    // author was previously extracted and then dropped from the result
    return {
      title,
      content,
      author,
      datePublished,
      leadImageUrl,
      dek,
    }
  }
}
// Extract a single field (opts.type). Tries the custom selectors the
// extractor defines for that type first; if nothing matches, falls back
// to the GenericExtractor function of the same name.
function extract(opts) {
  const { type, extractor, $, html = false } = opts

  // Forward the html flag so select can take its html branch;
  // previously it was never passed along.
  return select($, extractor[type], html) ||
    GenericExtractor[type](opts)
}
// Resolve a field from custom selectors.
//
// $         - cheerio-style query function for the document
// selectObj - per-field definition; reads `selectors` (and `clean`,
//             via cleanBySelectors)
// html      - when true, return the matched element after cleaning;
//             otherwise return its text run through stripTags
//
// Returns undefined when there is no definition, no selectors, or no
// selector that matches exactly one element.
function select($, selectObj, html=false) {
  if (!selectObj) return

  const { selectors } = selectObj
  if (!selectors) return

  // Only accept a selector that matches a single element
  const matchingSelector = selectors.find((selector) => $(selector).length === 1)
  if (!matchingSelector) return

  if (html) {
    // Previously the cleaned element was computed but never returned
    return cleanBySelectors($(matchingSelector), $, selectObj)
  }

  return stripTags($(matchingSelector).text(), $)
}
// Remove every element inside $content that matches one of the
// selectors in selectObj.clean, then return $content.
// When no clean list is provided, $content is returned untouched
// (previously this threw on clean.join).
function cleanBySelectors($content, $, selectObj) {
  const { clean } = selectObj
  if (!clean || clean.length === 0) return $content

  $(clean.join(','), $content).remove()
  return $content
}
export default CustomExtractor

@ -1,27 +0,0 @@
import assert from 'assert'
import fs from 'fs'
import cheerio from 'cheerio'
import CustomExtractor from './extractor'
import GenericExtractor from '../generic'
import NYMagExtractor from './nymag.com'
// Runs the NYMag custom extractor against a saved fixture page and
// checks the extracted title.
describe('CustomExtractor', () => {
  it('extracts based on custom selectors', () => {
    const fixturePath = './src/extractor/custom/nymag.com/fixtures/test.html'
    const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
    const html = fs.readFileSync(fixturePath, 'utf8')
    const $ = cheerio.load(html)

    const extracted = CustomExtractor.extract(
      NYMagExtractor, { url, html, $, metaCache: [] }
    )

    const expectedTitle = 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation'
    assert.equal(extracted.title, expectedTitle)
  })
})

@ -15,13 +15,13 @@ const NYMagExtractor = {
],
// Array of transformations to make on matched elements
// Each item in the array is an object. The key is the
// Each item in the array is an object. The key is the
// selector, the value is a transformation function
// for the matching node.
transforms: [
// Convert h1s to h2s
{
'h1': ($node) => convertNodeTo($node, $, 'h2')
'h1': 'h2'
},
// Convert lazy-loaded noscript images to figures
@ -29,7 +29,7 @@ const NYMagExtractor = {
'noscript': ($node) => {
const $children = $node.children()
if ($children.length === 1 && $children.get(0).tagName === 'img') {
convertNodeTo($node, $, 'figure')
return 'figure'
}
}
}

@ -1,5 +1,4 @@
import {
convertNodeTo,
rewriteTopLevel,
cleanImages,
stripJunkTags,
@ -10,6 +9,8 @@ import {
removeEmpty,
} from './utils/dom'
import { convertNodeTo } from '../../utils/dom'
// Clean our article content, returning a new, cleaned node.
export default function extractCleanNode(article, $, cleanConditionally=true, title='') {
// do I need to copy/clone?

@ -1,4 +1,4 @@
import { convertNodeTo } from './index'
import { convertNodeTo } from '../../../../utils/dom'
// H1 tags are typically the article title, which should be extracted
// by the title extractor instead. If there's less than 3 of them (<3),

@ -1,3 +1,5 @@
import { convertNodeTo } from '../../../../utils/dom'
import { brsToPs } from './index'
import { DIV_TO_P_BLOCK_TAGS } from '../constants'
// Loop through the provided doc, and convert any p-like elements to
@ -42,8 +44,3 @@ function convertSpans($) {
return $
}
// Swap `node` for a new element named `tag` (default 'p') that wraps
// the node's stringified contents. Returns the `$` it was given.
export function convertNodeTo(node, $, tag='p') {
  const inner = $(node).contents()
  const markup = `<${tag}>${inner}</${tag}>`
  $(node).replaceWith(markup)
  return $
}

@ -11,8 +11,6 @@ import {
convertToParagraphs
} from './index'
import { convertNodeTo } from './convert-to-paragraphs'
describe('Generic Extractor Utils', () => {
describe('convertToParagraphs($)', () => {
@ -22,18 +20,6 @@ describe('Generic Extractor Utils', () => {
})
describe('convertNodeTo(node, $)', () => {
it('takes a node with any tag and turns it into a P tag', () => {
const $ = cheerio.load(HTML.convertNodeTo.before)
const node = $('div').first()
const result = convertNodeTo(node, $).html()
assertClean(result, HTML.convertNodeTo.after)
})
})
})

@ -1,4 +1,4 @@
import { convertNodeTo } from './index'
import { convertNodeTo } from '../../../../utils/dom'
// Rewrite the tag name to div if it's a top level node like body or
// html to avoid later complications with multiple body tags.

@ -219,12 +219,6 @@ const HTML = {
`,
},
// convertNodeTo
convertNodeTo: {
before: '<div>Should become a p</div>',
after: '<p>Should become a p</p>',
},
// linkDensity
linkDensity5: `
<div><p>Some text!</p><p><a href="">Some text!</a></p> </div>

@ -7,7 +7,8 @@ import {
getOrInitScore,
addScore,
} from './index'
import { convertNodeTo } from '../dom'
import { convertNodeTo } from '../../../../utils/dom'
// score content. Parents get the full value of their children's
// content score, grandparents half

@ -1,5 +1,5 @@
import { TEXT_LINK_RE } from './constants'
import { stripTags } from '../../utils'
import { stripTags } from '../../utils/dom'
// Take a dek HTML fragment, and return the cleaned version of it.
// Return None if the dek wasn't good enough.

@ -17,7 +17,7 @@ const GenericExtractor = {
leadImageUrl: GenericLeadImageUrlExtractor.extract,
dek: GenericDekExtractor.extract,
parse: function(options) {
extract: function(options) {
let { html } = options
if (html) {

@ -6,8 +6,8 @@ import { clean } from './content/utils/dom/test-helpers'
import GenericExtractor from './index'
describe('GenericExtractor', () => {
describe('parse(html)', () => {
it("parses this old LA Times article", () => {
describe('extract(opts)', () => {
it("extracts this old LA Times article", () => {
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
const {
@ -16,7 +16,7 @@ describe('GenericExtractor', () => {
datePublished,
dek,
leadImageUrl,
} = GenericExtractor.parse(
} = GenericExtractor.extract(
{ url: "http://latimes.com", html, metaCache: [] }
)
@ -33,7 +33,7 @@ describe('GenericExtractor', () => {
assert.equal(leadImageUrl, 'http://latimesblogs.latimes.com/fb.jpg')
})
it("parses html and returns the article title", () => {
it("extracts html and returns the article title", () => {
const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')
const {
@ -42,7 +42,7 @@ describe('GenericExtractor', () => {
datePublished,
dek,
leadImageUrl,
} = GenericExtractor.parse(
} = GenericExtractor.extract(
{ url: "http://wired.com", html, metaCache: [] }
)

@ -1,6 +1,6 @@
import { TITLE_SPLITTERS_RE } from '../constants'
import { resolveSplitTitle } from './index'
import { stripTags } from '../../../utils'
import { stripTags } from '../../../utils/dom'
export default function cleanTitle(title, url, $) {
// If title has |, :, or - in it, see if

@ -1,4 +1,4 @@
import { stripTags } from '../../utils'
import { stripTags } from '../../utils/dom'
// Given a node type to search for, and a list of meta tag names to
// search for, find a meta tag associated.

@ -0,0 +1,99 @@
import 'babel-polyfill'
import GenericExtractor from './generic'
import { convertNodeTo, stripTags } from './utils/dom'
// Entry point for extraction. Uses a site-specific extractor's custom
// selectors where available and falls back to generic extraction.
const RootExtractor = {
  /**
   * Extract the article fields for a page.
   *
   * @param {Object} extractor - Extractor definition. When its `domain`
   *   is '*', it is the generic extractor and its own `extract` method
   *   is run with `opts`. Defaults to GenericExtractor.
   * @param {Object} opts - Options forwarded to every per-field extraction.
   * @returns {Object} title, content, author, datePublished, leadImageUrl
   *   and dek (or whatever `extractor.extract` returns on the generic path).
   */
  extract(extractor=GenericExtractor, opts) {
    // This is the generic extractor. Run its extract method
    if (extractor.domain === '*') return extractor.extract(opts)

    const title = extract({ ...opts, type: 'title', extractor })
    const datePublished = extract({ ...opts, type: 'datePublished', extractor })
    const author = extract({ ...opts, type: 'author', extractor })
    const content = extract({ ...opts, type: 'content', extractor, html: true })
    const leadImageUrl = extract({ ...opts, type: 'leadImageUrl', extractor, html: true })
    const dek = extract({ ...opts, type: 'dek', extractor, html: true })

    // author was previously extracted and then dropped from the result
    return {
      title,
      content,
      author,
      datePublished,
      leadImageUrl,
      dek,
    }
  }
}
// Extract a single field (opts.type). Tries the custom selectors the
// extractor defines for that type first.
function extract(opts) {
  const { type, extractor, $, html = false } = opts

  // Forward the html flag so select can return cleaned markup when
  // requested; previously it was never passed along.
  // If nothing matches the selector,
  // run the Generic extraction
  return select($, extractor[type], html) ||
    GenericExtractor[type](opts)
}
// Resolve a field from custom selectors.
//
// $              - cheerio-style query function for the document
// extractionOpts - per-field definition; reads `selectors` here and is
//                  passed on to cleanBySelectors / transformElements
// html           - when true, return the matched element after cleaning
//                  and transforming; otherwise return its stripped text
//
// Returns undefined when nothing usable is found.
function select($, extractionOpts, html=false) {
  // Skip if there's no extraction for this type
  if (!extractionOpts) return

  // A definition without selectors has nothing to match against
  const { selectors } = extractionOpts
  if (!selectors) return

  // Only accept a selector that matches a single element
  const matchingSelector = selectors.find((selector) => $(selector).length === 1)
  if (!matchingSelector) return

  // Plain-text fields: return the element's text with tags stripped
  if (!html) {
    return stripTags($(matchingSelector).text(), $)
  }

  // If the selector type requests html as its return type
  // clean the element with provided cleaning selectors.
  // Both helpers work on $content in place, so their return values are
  // not used; this keeps the matched element even if a helper returns
  // nothing.
  const $content = $(matchingSelector)
  cleanBySelectors($content, $, extractionOpts)
  transformElements($content, $, extractionOpts)
  return $content
}
// Remove elements by an array of selectors.
// Every element inside $content matching one of the `clean` selectors
// is removed. $content is always returned, including when no `clean`
// list is given, so callers can safely reassign from the result.
export function cleanBySelectors($content, $, { clean }) {
  if (!clean || clean.length === 0) return $content

  $(clean.join(','), $content).remove()
  return $content
}
// Transform matching elements.
//
// `transforms` maps a selector to either:
//   - a string: every match inside $content is converted to that tag
//   - a function: called with the wrapped match; if it returns a string,
//     the match is converted to that tag, otherwise it is left alone
//
// `transforms` may be a single map or an array of such maps.
// $content is always returned, including when no transforms are given.
export function transformElements($content, $, { transforms }) {
  if (!transforms) return $content

  const transformMaps = Array.isArray(transforms) ? transforms : [transforms]

  transformMaps.forEach((transformMap) => {
    if (!transformMap) return

    Object.keys(transformMap).forEach((selector) => {
      const value = transformMap[selector]
      const $matches = $(selector, $content)

      if (typeof value === 'string') {
        // If value is a string, convert directly
        $matches.each((index, node) => {
          convertNodeTo(node, $, value)
        })
      } else if (typeof value === 'function') {
        // If value is function, apply function to node
        $matches.each((index, node) => {
          const result = value($(node))
          // If function returns a string, convert node to that value
          if (typeof result === 'string') {
            convertNodeTo(node, $, result)
          }
        })
      }
    })
  })

  return $content
}
export default RootExtractor

@ -0,0 +1,135 @@
import assert from 'assert'
import fs from 'fs'
import cheerio from 'cheerio'
import CustomExtractor from './root-extractor'
import {
cleanBySelectors,
transformElements
} from './root-extractor'
import GenericExtractor from './generic'
import NYMagExtractor from './custom/nymag.com'
// Runs the NYMag custom extractor against a saved fixture page and
// checks the extracted title.
describe('CustomExtractor', () => {
  it('extracts based on custom selectors', () => {
    const fixturePath = './src/extractor/custom/nymag.com/fixtures/test.html'
    const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'
    const html = fs.readFileSync(fixturePath, 'utf8')
    const $ = cheerio.load(html)

    const extracted = CustomExtractor.extract(
      NYMagExtractor, { url, html, $, metaCache: [] }
    )

    const expectedTitle = 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation'
    assert.equal(extracted.title, expectedTitle)
  })
})
// Verifies that elements matching the `clean` selectors are stripped
// out of the selected content.
describe('cleanBySelectors($content, $, { clean })', () => {
  it('removes provided selectors from the content', () => {
    const html = `
<div>
<div class="body">
<div class="share">Share this on twitter plz</div>
<p>This is some good content</p>
<div class="ad">Advertisement!</div>
</div>
</div>`
    const $ = cheerio.load(html)
    const unwanted = ['.ad', '.share']

    const $cleaned = cleanBySelectors($('.body'), $, { clean: unwanted })

    assert.equal($cleaned.find('.ad').length, 0)
    assert.equal($cleaned.find('.share').length, 0)
  })
})
// Covers both transform flavours: a plain tag-name string and a
// function that decides the new tag per node.
describe('transformElements($content, $, { transforms })', () => {
  it('performs a simple transformation on matched elements', () => {
    const before = `
<div>
<div class="body">
<h1>WOW BIG TITLE</h1>
<p>Here are some words</p>
<h1>WOW BIG TITLE</h1>
</div>
</div>
`
    const expected = `
<div class="body">
<h2>WOW BIG TITLE</h2>
<p>Here are some words</p>
<h2>WOW BIG TITLE</h2>
</div>
`
    const $ = cheerio.load(before)
    const transforms = { 'h1': 'h2' }

    const $result = transformElements($('.body'), $, { transforms })

    assertClean($.html($result), expected)
  })

  it('performs a complex transformation on matched elements', () => {
    const before = `
<div>
<div class="body">
<noscript>
<img src="/img.jpg" />
</noscript>
<noscript>
Something else
</noscript>
<p>Here are some words</p>
</div>
</div>
`
    const expected = `
<div class="body">
<figure>
<img src="/img.jpg">
</figure>
<noscript>
Something else
</noscript>
<p>Here are some words</p>
</div>
`
    // Only a noscript wrapping exactly one img becomes a figure
    const figureIfLoneImage = ($node) => {
      const $kids = $node.children()
      const holdsSingleImage = $kids.length === 1 && $kids.get(0).tagName === 'img'
      return holdsSingleImage ? 'figure' : undefined
    }
    const $ = cheerio.load(before)

    const $result = transformElements($('.body'), $, {
      transforms: { 'noscript': figureIfLoneImage },
    })

    assertClean($.html($result), expected)
  })
})
// Normalise a string for comparison: trim it, drop all line breaks,
// then collapse each run of whitespace into a single space.
export function clean(string) {
  const trimmed = string.trim()
  const singleLine = trimmed.replace(/\r?\n|\r/g, '')
  return singleLine.replace(/\s+/g, ' ')
}
// Assert that two strings are equal once both have been normalised
// with clean().
export function assertClean(a, b) {
  const actual = clean(a)
  const expected = clean(b)
  assert.equal(actual, expected)
}

@ -0,0 +1,5 @@
// Swap `node` for a new element named `tag` (default 'p') that wraps
// the node's stringified contents. Returns the `$` it was given.
export default function convertNodeTo(node, $, tag='p') {
  const inner = $(node).contents()
  const markup = `<${tag}>${inner}</${tag}>`
  $(node).replaceWith(markup)
  return $
}

@ -0,0 +1,20 @@
import assert from 'assert'
import cheerio from 'cheerio'
import convertNodeTo from './convert-node-to'
// Checks that convertNodeTo rewrites a div into the default p tag.
describe('convertNodeTo(node, $)', () => {
  it('takes a node and converts it to a diff tag', () => {
    const $ = cheerio.load('<div>Should become a p</div>')
    const expected = '<p>Should become a p</p>'

    const actual = convertNodeTo($('div').first(), $).html()

    assert.equal(actual, expected)
  })
})

@ -1,2 +1,4 @@
export { default as withinComment } from './within-comment'
export { default as convertNodeTo } from './convert-node-to'
export { default as stripTags } from './strip-tags'

@ -3,7 +3,7 @@ import cheerio from 'cheerio'
import { stripTags } from './index'
describe('cleanTitle(title, $)', () => {
describe('stripTags(title, $)', () => {
it('strips tags from a string of text', () => {
const $ = cheerio.load('<div></div>')

@ -1,2 +1 @@
export { default as nodeIsSufficient } from './node-is-sufficient'
export { default as stripTags } from './strip-tags'

@ -1,4 +1,6 @@
import Iris from './index'
import assert from 'assert'
import Iris from './iris'
describe('Iris', function() {
describe('parse(url)', function() {

@ -2,7 +2,7 @@ import fs from 'fs'
import Resource from './resource'
import getExtractor from './extractor/get-extractor'
import RootExtractor from './extractor/custom/extractor'
import RootExtractor from './extractor/root-extractor'
import fetchResource from './resource/utils/fetch-resource'
Loading…
Cancel
Save