mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
feat: blogspot.com custom extractor
This commit is contained in:
parent
6c6451b34b
commit
9665fe7209
@ -1,8 +1,10 @@
|
||||
import GenericExtractor from './generic'
|
||||
import NYMagExtractor from './custom/nymag.com'
|
||||
import BloggerExtractor from './custom/blogspot.com'
|
||||
|
||||
// Registry of site-specific (custom) extractors, keyed by the domain
// each one handles.
const Extractors = {
  'nymag.com': NYMagExtractor,
  'blogspot.com': BloggerExtractor,
}
|
||||
|
||||
export default Extractors
|
||||
|
40
src/extractor/custom/blogspot.com/index.js
Normal file
40
src/extractor/custom/blogspot.com/index.js
Normal file
@ -0,0 +1,40 @@
|
||||
// Custom extractor definition for pages served from blogspot.com.
// Each field (content, author, title, datePublished) lists the CSS
// selectors used to locate that piece of the article.
const BloggerExtractor = {
  domain: 'blogspot.com',

  content: {
    // Blogger does not load its content initially in the page,
    // but it's all there in noscript
    selectors: [
      '.post-content noscript',
    ],

    // Selectors to remove from the extracted content
    // (none are needed for Blogger at the moment)
    clean: [
    ],

    // Convert the noscript tag to a div
    transforms: {
      'noscript': 'div',
    },
  },

  author: {
    selectors: [
      '.post-author-name',
    ],
  },

  title: {
    selectors: [
      'h2.title',
    ],
  },

  datePublished: {
    selectors: [
      'span.publishdate',
    ],
  },
}
|
||||
|
||||
export default BloggerExtractor
|
@ -14,26 +14,24 @@ const NYMagExtractor = {
|
||||
'.single-related-story',
|
||||
],
|
||||
|
||||
// Array of transformations to make on matched elements
|
||||
// Each item in the array is an object. The key is the
|
||||
// selector, the value is a transformation function
|
||||
// for the matching node.
|
||||
transforms: [
|
||||
// Object of transformations to make on matched elements
|
||||
// Each key is the selector, each value is the tag to
|
||||
// transform to.
|
||||
// If a function is given, it should return a string
|
||||
// to convert to or nothing (in which case it will not perform
|
||||
// the transformation).
|
||||
transforms: {
|
||||
// Convert h1s to h2s
|
||||
{
|
||||
'h1': 'h2'
|
||||
},
|
||||
'h1': 'h2',
|
||||
|
||||
// Convert lazy-loaded noscript images to figures
|
||||
{
|
||||
'noscript': ($node) => {
|
||||
const $children = $node.children()
|
||||
if ($children.length === 1 && $children.get(0).tagName === 'img') {
|
||||
return 'figure'
|
||||
}
|
||||
'noscript': ($node) => {
|
||||
const $children = $node.children()
|
||||
if ($children.length === 1 && $children.get(0).tagName === 'img') {
|
||||
return 'figure'
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
|
||||
title: {
|
||||
|
@ -6,6 +6,7 @@ import GenericExtractor from './generic'
|
||||
/**
 * Pick the extractor to use for a URL.
 *
 * Lookup order:
 *   1. the exact hostname of the URL,
 *   2. the last two dot-separated labels of the hostname
 *      (e.g. `googleblog.blogspot.com` -> `blogspot.com`),
 *   3. GenericExtractor.
 *
 * @param {string} url - URL of the page being parsed.
 * @returns {Object} The matching entry of Extractors, or GenericExtractor
 *   when nothing matches or the URL carries no hostname.
 */
export default function getExtractor(url) {
  // NOTE(review): `URL` is imported outside the lines visible here;
  // presumably Node's `url` module — confirm.
  const parsedUrl = URL.parse(url)
  const hostname = parsedUrl ? parsedUrl.hostname : null

  // Without a hostname (e.g. a relative URL) there is nothing to look up;
  // fall back to the generic extractor instead of throwing on `.split`.
  if (!hostname) {
    return GenericExtractor
  }

  const baseDomain = hostname.split('.').slice(-2).join('.')

  return Extractors[hostname] || Extractors[baseDomain] || GenericExtractor
}
|
||||
|
@ -14,4 +14,10 @@ describe('getExtractor(url)', () => {
|
||||
|
||||
assert.equal(extractor.domain, 'nymag.com')
|
||||
})
|
||||
|
||||
it('falls back to base domain if subdomain not found', () => {
|
||||
const extractor = getExtractor('https://googleblog.blogspot.com')
|
||||
|
||||
assert.equal(extractor.domain, 'blogspot.com')
|
||||
})
|
||||
})
|
||||
|
@ -9,7 +9,10 @@ import fetchResource from './resource/utils/fetch-resource'
|
||||
const Iris = {
|
||||
parse: async function(url, html) {
|
||||
const $ = await Resource.create(url, html)
|
||||
html = $.html()
|
||||
|
||||
const Extractor = getExtractor(url)
|
||||
console.log(`Using extractor for ${Extractor.domain}`)
|
||||
|
||||
// Cached value of every meta name in our document.
|
||||
// Used when extracting title/author/date_published/dek
|
||||
|
@ -10,5 +10,11 @@ describe('Iris', function() {
|
||||
|
||||
// console.log(result)
|
||||
})
|
||||
|
||||
it('does blogger', async function() {
|
||||
const result = await Iris.parse('https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html')
|
||||
|
||||
// console.log(result)
|
||||
})
|
||||
})
|
||||
})
|
Loading…
Reference in New Issue
Block a user