From 9665fe7209d54ccfb9759f9415de42fc0a14d16c Mon Sep 17 00:00:00 2001
From: Adam Pash <adam.pash@gmail.com>
Date: Thu, 8 Sep 2016 12:19:54 -0400
Subject: [PATCH] feat: blogspot.com custom extractor

---
 src/extractor/all.js                       |  2 ++
 src/extractor/custom/blogspot.com/index.js | 40 ++++++++++++++++++++++
 src/extractor/custom/nymag.com/index.js    | 28 +++++++--------
 src/extractor/get-extractor.js             |  3 +-
 src/extractor/get-extractor.test.js        |  6 ++++
 src/iris.js                                |  3 ++
 src/{index.test.js => iris.test.js}        |  6 ++++
 7 files changed, 72 insertions(+), 16 deletions(-)
 create mode 100644 src/extractor/custom/blogspot.com/index.js
 rename src/{index.test.js => iris.test.js} (65%)

diff --git a/src/extractor/all.js b/src/extractor/all.js
index bbda959d..0deddcbb 100644
--- a/src/extractor/all.js
+++ b/src/extractor/all.js
@@ -1,8 +1,10 @@
 import GenericExtractor from './generic'
 import NYMagExtractor from './custom/nymag.com'
+import BloggerExtractor from './custom/blogspot.com'
 
 const Extractors = {
   'nymag.com': NYMagExtractor,
+  'blogspot.com': BloggerExtractor,
 }
 
 export default Extractors
diff --git a/src/extractor/custom/blogspot.com/index.js b/src/extractor/custom/blogspot.com/index.js
new file mode 100644
index 00000000..20a294ae
--- /dev/null
+++ b/src/extractor/custom/blogspot.com/index.js
@@ -0,0 +1,40 @@
+const BloggerExtractor = {
+  domain: 'blogspot.com',
+  content: {
+    // Blogger is insane and does not load its content
+    // initially in the page, but it's all there
+    // in noscript
+    selectors: [
+      '.post-content noscript',
+    ],
+
+    // Selectors to remove from the extracted content
+    clean: [
+    ],
+
+    // Convert the noscript tag to a div
+    transforms: {
+      'noscript': 'div'
+    },
+  },
+
+  author: {
+    selectors: [
+      '.post-author-name'
+    ]
+  },
+
+  title: {
+    selectors: [
+      'h2.title',
+    ]
+  },
+
+  datePublished: {
+    selectors: [
+      'span.publishdate',
+    ]
+  }
+}
+
+export default BloggerExtractor
diff --git a/src/extractor/custom/nymag.com/index.js b/src/extractor/custom/nymag.com/index.js
index 910094e9..6b709231 100644
--- a/src/extractor/custom/nymag.com/index.js
+++ b/src/extractor/custom/nymag.com/index.js
@@ -14,26 +14,24 @@ const NYMagExtractor = {
       '.single-related-story',
     ],
 
-    // Array of tranformations to make on matched elements
-    // Each item in the array is an object. They key is the
-    // selector, the value is a tranformation function
-    // for the matching node.
-    transforms: [
+    // Object of transformations to make on matched elements.
+    // Each key is the selector, each value is the tag to
+    // transform to.
+    // If a function is given, it should return a string
+    // to convert to or nothing (in which case it will not perform
+    // the transformation).
+    transforms: {
       // Convert h1s to h2s
-      {
-        'h1': 'h2'
-      },
+      'h1': 'h2',
 
       // Convert lazy-loaded noscript images to figures
-      {
-        'noscript': ($node) => {
-          const $children = $node.children()
-          if ($children.length === 1 && $children.get(0).tagName === 'img') {
-            return 'figure'
-          }
+      'noscript': ($node) => {
+        const $children = $node.children()
+        if ($children.length === 1 && $children.get(0).tagName === 'img') {
+          return 'figure'
         }
       }
-    ]
+    }
   },
 
   title: {
diff --git a/src/extractor/get-extractor.js b/src/extractor/get-extractor.js
index 5ce7e7a8..e69d9e7a 100644
--- a/src/extractor/get-extractor.js
+++ b/src/extractor/get-extractor.js
@@ -6,6 +6,7 @@ import GenericExtractor from './generic'
 export default function getExtractor(url) {
   const parsedUrl = URL.parse(url)
   const { hostname } = parsedUrl
+  const baseDomain = hostname.split('.').slice(-2).join('.')
 
-  return Extractors[hostname] || GenericExtractor
+  return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor
 }
diff --git a/src/extractor/get-extractor.test.js b/src/extractor/get-extractor.test.js
index c3151bf0..b278775c 100644
--- a/src/extractor/get-extractor.test.js
+++ b/src/extractor/get-extractor.test.js
@@ -14,4 +14,10 @@ describe('getExtractor(url)', () => {
 
     assert.equal(extractor.domain, 'nymag.com')
   })
+
+  it('falls back to base domain if subdomain not found', () => {
+    const extractor = getExtractor('https://googleblog.blogspot.com')
+
+    assert.equal(extractor.domain, 'blogspot.com')
+  })
 })
diff --git a/src/iris.js b/src/iris.js
index 82135ae6..fe04ad6f 100644
--- a/src/iris.js
+++ b/src/iris.js
@@ -9,7 +9,10 @@ import fetchResource from './resource/utils/fetch-resource'
 const Iris = {
   parse: async function(url, html) {
     const $ = await Resource.create(url, html)
+    html = $.html()
+
     const Extractor = getExtractor(url)
+    console.log(`Using extractor for ${Extractor.domain}`)
 
     // Cached value of every meta name in our document.
     // Used when extracting title/author/date_published/dek
diff --git a/src/index.test.js b/src/iris.test.js
similarity index 65%
rename from src/index.test.js
rename to src/iris.test.js
index 5ace027c..d836014e 100644
--- a/src/index.test.js
+++ b/src/iris.test.js
@@ -10,5 +10,11 @@ describe('Iris', function() {
 
       // console.log(result)
     })
+
+    it('does blogger', async function() {
+      const result = await Iris.parse('https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html')
+
+      // console.log(result)
+    })
   })
 })