diff --git a/NOTES.md b/NOTES.md new file mode 100644 index 00000000..824379b3 --- /dev/null +++ b/NOTES.md @@ -0,0 +1,84 @@ +Each extractor should ultimately be an object that exports like so: + +```javascript +import GenericContentExtractor from './content/extractor' +import GenericTitleExtractor from './title/extractor' +import GenericAuthorExtractor from './author/extractor' +import GenericDatePublishedExtractor from './date-published/extractor' +import GenericDekExtractor from './dek/extractor' +import GenericLeadImageUrlExtractor from './lead-image-url/extractor' + +const GenericExtractor = { + content: GenericContentExtractor, + title: GenericTitleExtractor, + author: GenericAuthorExtractor, + datePublished: GenericDatePublishedExtractor, + dek: GenericDekExtractor, + leadImageUrl: GenericLeadImageUrlExtractor, +} +``` + +Custom parsers can then be merged with the generic parser to fill in gaps in their implementations. E.g: + +```javascript +import NYMagContentExtractor from '...' +import NYMagTitleExtractor from '...' + +const NYMagExtractor = { + content: NYMagContentExtractor, + title: NYMagTitleExtractor, +} + +const Extractor = { + ...GenericExtractor, + ...NYMagExtractor +} + +``` + +# Declarative Custom Extractors + +My goal is to create declarative extractors that describe what rather than how. So, for example: + +```javascript +NYMagExtractor = { + content: { + // Order by most likely. Extractor will stop on first occurrence + selectors: [ + 'div.article-content', + 'section.body', + 'article.article', + ], + + // Selectors to remove from the extracted content + clean: [ + '.ad', + ], + + // Array of transformations to make on matched elements + // Each item in the array is an object. The key is the + // selector, the value is a transformation function + // for the matching node. 
+ transforms: [ + // Convert h1s to h2s + { + 'h1': ($node) => convertNodeTo($node, $, 'h2') + }, + + // Convert lazy-loaded noscript images to figures + { + 'noscript': ($node) => { + const $children = $node.children() + if ($children.length === 1 && $children.get(0).tagName === 'img') { + convertNodeTo($node, $, 'figure') + } + } + } + ] + }, + + title: [ + 'h1', + ] +} +``` diff --git a/TODO.md b/TODO.md index 909e75e1..aebeb01f 100644 --- a/TODO.md +++ b/TODO.md @@ -1,6 +1,7 @@ TODO: - Tmrw: - - extractNextPageUrl +- Check that lead-image-url extractor isn't looking for end-of-string file extension matches (e.g., it could be ...foo.jpg?otherstuff) +- Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc) +- extractNextPageUrl - Try Closure webpack compiler - Rename all cleaners from cleanThing to clean - Make sure weightNodes flag is being passed properly @@ -10,7 +11,6 @@ TODO: - Test if .is method is faster than regex methods - Separate constants into activity-specific folders (dom, scoring) - DONE: x extractLeadImageUrl x extractDek diff --git a/src/extractor/generic/index.js b/src/extractor/generic/index.js index ace7c7f9..b14c389c 100644 --- a/src/extractor/generic/index.js +++ b/src/extractor/generic/index.js @@ -20,7 +20,7 @@ const GenericExtractor = { // Cached value of every meta name in our document. 
// Used when extracting title/author/date_published/dek - const metaCache = $('meta').map((index, node) => { + const metaCache = $('meta').map((_, node) => { return $(node).attr('name') }).toArray() diff --git a/src/extractor/generic/index.test.js b/src/extractor/generic/index.test.js index a22a2f45..ffd4868f 100644 --- a/src/extractor/generic/index.test.js +++ b/src/extractor/generic/index.test.js @@ -24,7 +24,7 @@ describe('GenericExtractor', () => { 'California appears poised to be first to ban power-guzzling big-screen TVs' ) assert.equal( - datePublished.toISOString(), + datePublished, '2009-10-14T04:00:00.000Z' ) assert.equal(dek, null)