feat: blogspot.com custom extractor
parent
6c6451b34b
commit
9665fe7209
@ -1,8 +1,10 @@
|
||||
import GenericExtractor from './generic'
|
||||
import NYMagExtractor from './custom/nymag.com'
|
||||
import BloggerExtractor from './custom/blogspot.com'
|
||||
|
||||
/**
 * Registry of custom (site-specific) extractors.
 *
 * Each key is a domain string and each value is the extractor object
 * imported for that domain. How the registry is looked up is not shown
 * in this file -- presumably by the hostname of the page being parsed;
 * confirm against the caller.
 */
const Extractors = {
  'nymag.com': NYMagExtractor,
  'blogspot.com': BloggerExtractor,
}
|
||||
|
||||
export default Extractors
|
||||
|
@ -0,0 +1,40 @@
|
||||
/**
 * Custom extractor definition for Blogger-hosted pages (blogspot.com).
 *
 * This is a plain configuration object: `domain` names the site, and each
 * of `content`, `author`, `title` and `datePublished` carries a list of
 * CSS selectors for that field. The code that consumes this object is not
 * part of this file.
 */
const BloggerExtractor = {
  domain: 'blogspot.com',

  content: {
    // Blogger does not put the post body into the initially loaded page
    // markup; the full content is available inside a <noscript> element,
    // so that element is what gets selected.
    selectors: [
      '.post-content noscript',
    ],

    // Selectors to remove from the extracted content (none at present).
    clean: [],

    // Convert the selected <noscript> tag to a <div>.
    transforms: {
      noscript: 'div',
    },
  },

  author: {
    selectors: [
      '.post-author-name',
    ],
  },

  title: {
    selectors: [
      'h2.title',
    ],
  },

  datePublished: {
    selectors: [
      'span.publishdate',
    ],
  },
}
|
||||
|
||||
export default BloggerExtractor
|
Loading…
Reference in New Issue