From 9665fe7209d54ccfb9759f9415de42fc0a14d16c Mon Sep 17 00:00:00 2001 From: Adam Pash Date: Thu, 8 Sep 2016 12:19:54 -0400 Subject: [PATCH] feat: blogspot.com custom extractor --- src/extractor/all.js | 2 ++ src/extractor/custom/blogspot.com/index.js | 40 ++++++++++++++++++++++ src/extractor/custom/nymag.com/index.js | 28 +++++++-------- src/extractor/get-extractor.js | 3 +- src/extractor/get-extractor.test.js | 6 ++++ src/iris.js | 3 ++ src/{index.test.js => iris.test.js} | 6 ++++ 7 files changed, 72 insertions(+), 16 deletions(-) create mode 100644 src/extractor/custom/blogspot.com/index.js rename src/{index.test.js => iris.test.js} (65%) diff --git a/src/extractor/all.js b/src/extractor/all.js index bbda959d..0deddcbb 100644 --- a/src/extractor/all.js +++ b/src/extractor/all.js @@ -1,8 +1,10 @@ import GenericExtractor from './generic' import NYMagExtractor from './custom/nymag.com' +import BloggerExtractor from './custom/blogspot.com' const Extractors = { 'nymag.com': NYMagExtractor, + 'blogspot.com': BloggerExtractor, } export default Extractors diff --git a/src/extractor/custom/blogspot.com/index.js b/src/extractor/custom/blogspot.com/index.js new file mode 100644 index 00000000..20a294ae --- /dev/null +++ b/src/extractor/custom/blogspot.com/index.js @@ -0,0 +1,40 @@ +const BloggerExtractor = { + domain: 'blogspot.com', + content: { + // Blogger is insane and does not load its content + // initially in the page, but it's all there + // in noscript + selectors: [ + '.post-content noscript', + ], + + // Selectors to remove from the extracted content + clean: [ + ], + + // Convert the noscript tag to a div + transforms: { + 'noscript': 'div' + }, + }, + + author: { + selectors: [ + '.post-author-name' + ] + }, + + title: { + selectors: [ + 'h2.title', + ] + }, + + datePublished: { + selectors: [ + 'span.publishdate', + ] + } +} + +export default BloggerExtractor diff --git a/src/extractor/custom/nymag.com/index.js b/src/extractor/custom/nymag.com/index.js index 
910094e9..6b709231 100644 --- a/src/extractor/custom/nymag.com/index.js +++ b/src/extractor/custom/nymag.com/index.js @@ -14,26 +14,24 @@ const NYMagExtractor = { '.single-related-story', ], - // Array of tranformations to make on matched elements - // Each item in the array is an object. They key is the - // selector, the value is a tranformation function - // for the matching node. - transforms: [ + // Object of transformations to make on matched elements + // Each key is the selector, each value is the tag to + // transform to. + // If a function is given, it should return a string + // to convert to or nothing (in which case it will not perform + // the transformation). + transforms: { // Convert h1s to h2s - { - 'h1': 'h2' - }, + 'h1': 'h2', // Convert lazy-loaded noscript images to figures - { - 'noscript': ($node) => { - const $children = $node.children() - if ($children.length === 1 && $children.get(0).tagName === 'img') { - return 'figure' - } + 'noscript': ($node) => { + const $children = $node.children() + if ($children.length === 1 && $children.get(0).tagName === 'img') { + return 'figure' } } - ] + } }, title: { diff --git a/src/extractor/get-extractor.js b/src/extractor/get-extractor.js index 5ce7e7a8..e69d9e7a 100644 --- a/src/extractor/get-extractor.js +++ b/src/extractor/get-extractor.js @@ -6,6 +6,7 @@ import GenericExtractor from './generic' export default function getExtractor(url) { const parsedUrl = URL.parse(url) const { hostname } = parsedUrl + const baseDomain = hostname.split('.').slice(-2).join('.') - return Extractors[hostname] || GenericExtractor + return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor } diff --git a/src/extractor/get-extractor.test.js b/src/extractor/get-extractor.test.js index c3151bf0..b278775c 100644 --- a/src/extractor/get-extractor.test.js +++ b/src/extractor/get-extractor.test.js @@ -14,4 +14,10 @@ describe('getExtractor(url)', () => { assert.equal(extractor.domain, 'nymag.com') }) + + it('falls 
back to base domain if subdomain not found', () => { + const extractor = getExtractor('https://googleblog.blogspot.com') + + assert.equal(extractor.domain, 'blogspot.com') + }) }) diff --git a/src/iris.js b/src/iris.js index 82135ae6..fe04ad6f 100644 --- a/src/iris.js +++ b/src/iris.js @@ -9,7 +9,10 @@ import fetchResource from './resource/utils/fetch-resource' const Iris = { parse: async function(url, html) { const $ = await Resource.create(url, html) + html = $.html() + const Extractor = getExtractor(url) + console.log(`Using extractor for ${Extractor.domain}`) // Cached value of every meta name in our document. // Used when extracting title/author/date_published/dek diff --git a/src/index.test.js b/src/iris.test.js similarity index 65% rename from src/index.test.js rename to src/iris.test.js index 5ace027c..d836014e 100644 --- a/src/index.test.js +++ b/src/iris.test.js @@ -10,5 +10,11 @@ describe('Iris', function() { // console.log(result) }) + + it('does blogger', async function() { + const result = await Iris.parse('https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html') + + // console.log(result) + }) }) })