feat: can now fetch attrs in RootExtractor's select method

pull/1/head
Adam Pash 8 years ago
parent 33c7e0d1c9
commit bdc2c0c1da

@ -12,6 +12,7 @@ TODO:
- Separate constants into activity-specific folders (dom, scoring)
DONE:
x add option to fetch attrs in RootExtractor's select method
x get custom datePublished selector to convert to date object (probably through a cleaner)
x extract and generalize cleaners
x move arguments to cleaners to object

@ -0,0 +1 @@
// Matches a bracketed attribute name inside a selector string, e.g. the
// `[src]` in `img[src]`. Capture group 1 holds the attribute name, which may
// contain word characters and hyphens. The closing `]` must follow the name
// directly, so attribute-value tests such as `[rel="x"]` do not match.
export const ATTR_RE = /\[([\w-]+)\]/

@ -39,6 +39,19 @@ const NYMagExtractor = {
'h1.headline-primary',
'h1',
]
},
author: {
selectors: [
'.by-authors',
]
},
datePublished: {
selectors: [
'time.article-timestamp[datetime]',
'time.article-timestamp',
]
}
}

@ -3,6 +3,7 @@ import 'babel-polyfill'
import GenericExtractor from './generic'
import Cleaners from '../cleaners'
import { convertNodeTo, stripTags } from './utils/dom'
import { ATTR_RE } from './constants'
const RootExtractor = {
extract(extractor=GenericExtractor, opts) {
@ -44,7 +45,7 @@ function extract(opts) {
GenericExtractor[type](opts)
}
function select(opts) {
export function select(opts) {
const { $, type, extractionOpts, extractHtml=false } = opts
// Skip if there are no extraction options for this type
if (!extractionOpts) return
@ -75,8 +76,17 @@ function select(opts) {
return $.html($content)
} else {
// return stripTags($(matchingSelector).text(), $)
return Cleaners[type]($(matchingSelector).text(), opts)
// if selector includes an attr (e.g., img[src]),
// extract the attr
const attr = matchingSelector.match(ATTR_RE)
let result
if (attr) {
result = $(matchingSelector).attr(attr[1])
} else {
// otherwise use the text of the node
result = $(matchingSelector).text()
}
return Cleaners[type](result, opts)
}
}

@ -3,6 +3,7 @@ import fs from 'fs'
import cheerio from 'cheerio'
import RootExtractor from './root-extractor'
import { select } from './root-extractor'
import {
cleanBySelectors,
transformElements
@ -125,11 +126,51 @@ describe('transformElements($content, $, { transforms })', () => {
})
})
export function clean(string) {
// Tests for select(opts): given a cheerio instance ($), a field type and
// extractionOpts.selectors, it should return either the matched node's text
// or, when the selector names an attribute in brackets, that attribute's value.
describe('select(opts)', () => {
// Plain selector with no bracketed attribute: the node's text is returned.
it(`returns a node's text with a simple selector`, () => {
const html = `
<div><div class="author">Bob</div></div>
`
const $ = cheerio.load(html)
const opts = {
type: 'author',
$,
extractionOpts: {
selectors: ['.author']
}
}
const result = select(opts)
assert.equal(result, 'Bob')
})
// Selector ending in [datetime]: the attribute value is used instead of the
// element's visible text ("September 7, 2016").
it(`returns a node's attr with a attr selector`, () => {
const html = `
<div>
<time datetime="2016-09-07T05:07:59-04:00">
September 7, 2016
</time>
</div>
`
const $ = cheerio.load(html)
const opts = {
type: 'datePublished',
$,
extractionOpts: {
selectors: ['time[datetime]']
}
}
const result = select(opts)
// The expected string is the same instant as the datetime attribute
// (05:07:59 at -04:00) written in UTC ISO form.
// NOTE(review): presumably the datePublished cleaner performs this
// conversion — confirm against Cleaners.
assert.equal(result, '2016-09-07T09:07:59.000Z')
})
})
/**
 * Normalizes the whitespace of a string so that two strings can be compared
 * without regard to formatting differences.
 *
 * @param {string} string - text to normalize
 * @returns {string} the text with surrounding whitespace trimmed, line breaks
 *   removed, and remaining whitespace runs collapsed to a single space
 */
function clean(string) {
  const trimmed = string.trim()
  // Line breaks are deleted outright (not turned into spaces), so text on
  // adjacent lines is joined unless other whitespace separates it.
  const withoutLineBreaks = trimmed.replace(/\r?\n|\r/g, '')
  const collapsed = withoutLineBreaks.replace(/\s+/g, ' ')
  return collapsed
}
export function assertClean(a, b) {
/**
 * Asserts that two strings are equal after both have been normalized with
 * clean(), so differences in whitespace handled by clean() are ignored.
 *
 * @param {string} a - first string to compare
 * @param {string} b - second string to compare
 */
function assertClean(a, b) {
  const normalizedA = clean(a)
  const normalizedB = clean(b)
  assert.equal(normalizedA, normalizedB)
}

Loading…
Cancel
Save