From bdc2c0c1da4f2e1970c34dd33b5a5ab6e2c59680 Mon Sep 17 00:00:00 2001 From: Adam Pash Date: Fri, 9 Sep 2016 10:25:12 -0400 Subject: [PATCH] feat: can now fetch attrs in RootExtractor's select method --- TODO.md | 1 + src/extractor/constants.js | 1 + src/extractor/custom/nymag.com/index.js | 13 +++++++ src/extractor/root-extractor.js | 16 +++++++-- src/extractor/root-extractor.test.js | 45 +++++++++++++++++++++++-- 5 files changed, 71 insertions(+), 5 deletions(-) create mode 100644 src/extractor/constants.js diff --git a/TODO.md b/TODO.md index 38a71233..f5c7f8bf 100644 --- a/TODO.md +++ b/TODO.md @@ -12,6 +12,7 @@ TODO: - Separate constants into activity-specific folders (dom, scoring) DONE: +x add option to fetch attrs in RootExtractor's select method x get custom datePublished selector to convert to date object (prob through cleaner) x extract and generalize cleaners x move arguments to cleaners to object diff --git a/src/extractor/constants.js b/src/extractor/constants.js new file mode 100644 index 00000000..b6fc067b --- /dev/null +++ b/src/extractor/constants.js @@ -0,0 +1 @@ +export const ATTR_RE = /\[([\w-]+)\]/ diff --git a/src/extractor/custom/nymag.com/index.js b/src/extractor/custom/nymag.com/index.js index 6b709231..d96a4bc6 100644 --- a/src/extractor/custom/nymag.com/index.js +++ b/src/extractor/custom/nymag.com/index.js @@ -39,6 +39,19 @@ const NYMagExtractor = { 'h1.headline-primary', 'h1', ] + }, + + author: { + selectors: [ + '.by-authors', + ] + }, + + datePublished: { + selectors: [ + 'time.article-timestamp[datetime]', + 'time.article-timestamp', + ] } } diff --git a/src/extractor/root-extractor.js b/src/extractor/root-extractor.js index c6f53480..5635dfd3 100644 --- a/src/extractor/root-extractor.js +++ b/src/extractor/root-extractor.js @@ -3,6 +3,7 @@ import 'babel-polyfill' import GenericExtractor from './generic' import Cleaners from '../cleaners' import { convertNodeTo, stripTags } from './utils/dom' +import { ATTR_RE } from './constants' 
const RootExtractor = { extract(extractor=GenericExtractor, opts) { @@ -44,7 +45,7 @@ function extract(opts) { GenericExtractor[type](opts) } -function select(opts) { +export function select(opts) { const { $, type, extractionOpts, extractHtml=false } = opts // Skip if there's not extraction for this type if (!extractionOpts) return @@ -75,8 +76,17 @@ function select(opts) { return $.html($content) } else { - // return stripTags($(matchingSelector).text(), $) - return Cleaners[type]($(matchingSelector).text(), opts) + // if selector includes an attr (e.g., img[src]), + // extract the attr + const attr = matchingSelector.match(ATTR_RE) + let result + if (attr) { + result = $(matchingSelector).attr(attr[1]) + } else { + // otherwise use the text of the node + result = $(matchingSelector).text() + } + return Cleaners[type](result, opts) } } diff --git a/src/extractor/root-extractor.test.js b/src/extractor/root-extractor.test.js index 1b86ebb7..aad473fe 100644 --- a/src/extractor/root-extractor.test.js +++ b/src/extractor/root-extractor.test.js @@ -3,6 +3,7 @@ import fs from 'fs' import cheerio from 'cheerio' import RootExtractor from './root-extractor' +import { select } from './root-extractor' import { cleanBySelectors, transformElements @@ -125,11 +126,51 @@ describe('transformElements($content, $, { transforms })', () => { }) }) -export function clean(string) { +describe('select(opts)', () => { + it(`returns a node's text with a simple selector`, () => { + const html = ` +
<div class="author">Bob</div>
+ ` + const $ = cheerio.load(html) + const opts = { + type: 'author', + $, + extractionOpts: { + selectors: ['.author'] + } + } + + const result = select(opts) + assert.equal(result, 'Bob') + }) + + it(`returns a node's attr with a attr selector`, () => { + const html = ` +
+ <time datetime="2016-09-07T09:07:59.000Z"></time> +
+ ` + const $ = cheerio.load(html) + const opts = { + type: 'datePublished', + $, + extractionOpts: { + selectors: ['time[datetime]'] + } + } + + const result = select(opts) + assert.equal(result, '2016-09-07T09:07:59.000Z') + }) +}) + +function clean(string) { return string.trim().replace(/\r?\n|\r/g, '').replace(/\s+/g, ' ') } -export function assertClean(a, b) { +function assertClean(a, b) { assert.equal(clean(a), clean(b)) }