feat: blogspot.com custom extractor

This commit is contained in:
Adam Pash 2016-09-08 12:19:54 -04:00
parent 6c6451b34b
commit 9665fe7209
7 changed files with 72 additions and 16 deletions

View File

@ -1,8 +1,10 @@
import GenericExtractor from './generic'
import NYMagExtractor from './custom/nymag.com'
import BloggerExtractor from './custom/blogspot.com'
// Registry of site-specific extractors. Each key is the domain string
// the extractor is registered under; the value is the extractor object.
const Extractors = {
  'nymag.com': NYMagExtractor,
  'blogspot.com': BloggerExtractor,
}
export default Extractors

View File

@ -0,0 +1,40 @@
/**
 * Custom extractor definition for blogs hosted on blogspot.com (Blogger).
 *
 * Each field (content, author, title, datePublished) lists the CSS
 * selectors used to locate that piece of the post.
 */
const BloggerExtractor = {
  domain: 'blogspot.com',

  content: {
    // Blogger does not put the post body in the initially loaded markup;
    // the full content is available inside a <noscript> element instead,
    // so that element is what gets selected.
    selectors: ['.post-content noscript'],

    // Selectors to strip from the extracted content (none needed).
    clean: [],

    // Re-tag the matched <noscript> as a <div>.
    transforms: { noscript: 'div' },
  },

  author: { selectors: ['.post-author-name'] },

  title: { selectors: ['h2.title'] },

  datePublished: { selectors: ['span.publishdate'] },
}
export default BloggerExtractor

View File

@ -14,26 +14,24 @@ const NYMagExtractor = {
'.single-related-story',
],
// Array of transformations to make on matched elements
// Each item in the array is an object. The key is the
// selector, the value is a transformation function
// for the matching node.
transforms: [
// Object of transformations to make on matched elements
// Each key is the selector, each value is the tag to
// transform to.
// If a function is given, it should return a string
// to convert to or nothing (in which case it will not perform
// the transformation).
transforms: {
// Convert h1s to h2s
{
'h1': 'h2'
},
'h1': 'h2',
// Convert lazy-loaded noscript images to figures
{
'noscript': ($node) => {
const $children = $node.children()
if ($children.length === 1 && $children.get(0).tagName === 'img') {
return 'figure'
}
'noscript': ($node) => {
const $children = $node.children()
if ($children.length === 1 && $children.get(0).tagName === 'img') {
return 'figure'
}
}
]
}
},
title: {

View File

@ -6,6 +6,7 @@ import GenericExtractor from './generic'
/**
 * Choose the extractor to use for a given page URL.
 *
 * Lookup order:
 *   1. the URL's exact hostname (e.g. "nymag.com"),
 *   2. the last two dot-separated labels of the hostname, so a subdomain
 *      such as "googleblog.blogspot.com" resolves to "blogspot.com",
 *   3. GenericExtractor when neither key is registered.
 *
 * @param {string} url - URL of the page being parsed.
 * @returns {Object} The matching entry of Extractors, or GenericExtractor.
 */
export default function getExtractor(url) {
  const { hostname } = URL.parse(url)

  // Last two labels of the hostname: "a.b.example.com" -> "example.com".
  const baseDomain = hostname.split('.').slice(-2).join('.')

  // A single return: an earlier hostname-only return would make the
  // base-domain fallback unreachable.
  return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor
}

View File

@ -14,4 +14,10 @@ describe('getExtractor(url)', () => {
assert.equal(extractor.domain, 'nymag.com')
})
// A hostname with no entry of its own should resolve through the entry
// registered for its base domain.
it('falls back to base domain if subdomain not found', () => {
  const { domain } = getExtractor('https://googleblog.blogspot.com')

  assert.equal(domain, 'blogspot.com')
})
})

View File

@ -9,7 +9,10 @@ import fetchResource from './resource/utils/fetch-resource'
const Iris = {
parse: async function(url, html) {
const $ = await Resource.create(url, html)
html = $.html()
const Extractor = getExtractor(url)
console.log(`Using extractor for ${Extractor.domain}`)
// Cached value of every meta name in our document.
// Used when extracting title/author/date_published/dek

View File

@ -10,5 +10,11 @@ describe('Iris', function() {
// console.log(result)
})
// Smoke test: makes no assertions, so it only fails if Iris.parse()
// rejects for this Blogger post URL.
it('does blogger', async function() {
  const bloggerPostUrl = 'https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html'
  const result = await Iris.parse(bloggerPostUrl)
  // console.log(result)
})
})
})