feat: blogspot.com custom extractor

2024-11-17 03:25:31 +00:00 · 2016-09-08 12:19:54 -04:00 · 2016-09-08 12:19:54 -04:00 · 9665fe7209
commit 9665fe7209
parent 6c6451b34b
7 changed files with 72 additions and 16 deletions
--- a/src/extractor/all.js
+++ b/src/extractor/all.js
@ -1,8 +1,10 @@
 import GenericExtractor from './generic'
 import NYMagExtractor from './custom/nymag.com'
+import BloggerExtractor from './custom/blogspot.com'

 const Extractors = {
  'nymag.com': NYMagExtractor,
+  'blogspot.com': BloggerExtractor,
 }

 export default Extractors
--- a/src/extractor/custom/blogspot.com/index.js
+++ b/src/extractor/custom/blogspot.com/index.js
@ -0,0 +1,40 @@
+const BloggerExtractor = {
+  domain: 'blogspot.com',
+  content: {
+    // Blogger does not put the post content in the initial
+    // page markup; the full content is only present inside
+    // a <noscript> element, so that is what we select.
+    selectors: [
+      '.post-content noscript',
+    ],
+
+    // Selectors to remove from the extracted content (none yet)
+    clean: [
+    ],
+
+    // Convert the noscript tag to a div (key: selector, value: new tag)
+    transforms: {
+      'noscript': 'div'
+    },
+  },
+
+  author: {
+    selectors: [
+      '.post-author-name'
+    ]
+  },
+
+  title: {
+    selectors: [
+      'h2.title',
+    ]
+  },
+
+  datePublished: {
+    selectors: [
+      'span.publishdate',
+    ]
+  }
+}
+
+export default BloggerExtractor
--- a/src/extractor/custom/nymag.com/index.js
+++ b/src/extractor/custom/nymag.com/index.js
@ -14,26 +14,24 @@ const NYMagExtractor = {
      '.single-related-story',
    ],

-    // Array of tranformations to make on matched elements
-    // Each item in the array is an object. They key is the
-    // selector, the value is a tranformation function
-    // for the matching node.
-    transforms: [
+    // Object of transformations to make on matched elements.
+    // Each key is the selector, each value is the tag to
+    // transform to.
+    // If a function is given, it should return a string
+    // to convert to or nothing (in which case it will not perform
+    // the transformation).
+    transforms: {
      // Convert h1s to h2s
-      {
-        'h1': 'h2'
-      },
+      'h1': 'h2',

      // Convert lazy-loaded noscript images to figures
-      {
-        'noscript': ($node) => {
-          const $children = $node.children()
-          if ($children.length === 1 && $children.get(0).tagName === 'img') {
-            return 'figure'
-          }
+      'noscript': ($node) => {
+        const $children = $node.children()
+        if ($children.length === 1 && $children.get(0).tagName === 'img') {
+          return 'figure'
        }
      }
-    ]
+    }
  },

  title: {
--- a/src/extractor/get-extractor.js
+++ b/src/extractor/get-extractor.js
@ -6,6 +6,7 @@ import GenericExtractor from './generic'
 export default function getExtractor(url) {
  const parsedUrl = URL.parse(url)
  const { hostname } = parsedUrl
+  const baseDomain = hostname.split('.').slice(-2).join('.')

-  return Extractors[hostname] || GenericExtractor
+  return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor
 }
--- a/src/extractor/get-extractor.test.js
+++ b/src/extractor/get-extractor.test.js
@ -14,4 +14,10 @@ describe('getExtractor(url)', () => {

    assert.equal(extractor.domain, 'nymag.com')
  })
+
+  it('falls back to base domain if subdomain not found', () => {
+    const extractor = getExtractor('https://googleblog.blogspot.com')
+
+    assert.equal(extractor.domain, 'blogspot.com')
+  })
 })
--- a/src/iris.js
+++ b/src/iris.js
@ -9,7 +9,10 @@ import fetchResource from './resource/utils/fetch-resource'
 const Iris = {
  parse: async function(url, html) {
    const $ = await Resource.create(url, html)
+    html = $.html()
+
    const Extractor = getExtractor(url)
+    console.log(`Using extractor for ${Extractor.domain}`)

    // Cached value of every meta name in our document.
    // Used when extracting title/author/date_published/dek
--- a/src/index.test.js
+++ b/src/index.test.js
@ -10,5 +10,11 @@ describe('Iris', function() {

      // console.log(result)
    })
+
+    it('does blogger', async function() {
+      const result = await Iris.parse('https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html')
+
+      // console.log(result)
+    })
  })
 })