Mirror of https://github.com/postlight/mercury-parser (synced 2024-11-05 12:00:13 +00:00)

commit 0f45b39ca2 (parent a022252a14)
refactor: preparing for extraction merging
@@ -13,13 +13,6 @@ const GenericContentExtractor = {
     cleanConditionally: true,
   },
 
-  // Entry point for parsing html
-  parse($, html, title='', opts={}) {
-    opts = { ...this.defaultOpts, ...opts }
-
-    return this.extract($, html, opts, title)
-  },
-
   // Extract the content for this resource - initially, pass in our
   // most restrictive opts which will return the highest quality
   // content. On each failure, retry with slightly more lax opts.
@@ -39,7 +32,9 @@ const GenericContentExtractor = {
   //
   // cleanConditionally: Clean the node to return of some
   // superfluous content. Things like forms, ads, etc.
-  extract($, html, opts, title) {
+  extract($, html, opts, title='') {
+    opts = { ...this.defaultOpts, ...opts }
+
     $ = $ || cheerio.load(html)
 
     // Cascade through our extraction-specific opts in an ordered fashion,
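
With the parse() wrapper removed, extract() becomes the single entry point for GenericContentExtractor: it now merges the caller's opts over defaultOpts itself and falls back to cheerio.load(html) when no $ is supplied. A minimal sketch of driving it directly, assuming the relative import path and the latimes fixture that the test file below uses:

import fs from 'fs'
import GenericContentExtractor from './extractor'  // path assumed, mirroring the test import below

const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')

// Passing null for $ lets extract() load the html itself; opts are merged
// over defaultOpts inside extract(), so a partial object is enough.
const content = GenericContentExtractor.extract(null, html, { cleanConditionally: false }, 'Example title')
console.log(content)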
@@ -8,15 +8,15 @@ import GenericContentExtractor from './extractor'
 
 describe('GenericContentExtractor', function() {
   this.timeout(1000000)
-  describe('parse(html, opts)', () => {
-    it("parses html and returns the article", () => {
+  describe('extract($, html, opts)', () => {
+    it("extracts html and returns the article", () => {
       const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
 
       // Array.from(range(1, 100)).map((i) => {
       //   console.log(i)
-      //   clean(GenericContentExtractor.parse(null, html))
+      //   clean(GenericContentExtractor.extract(null, html))
       // })
-      const result = clean(GenericContentExtractor.parse(null, html))
+      const result = clean(GenericContentExtractor.extract(null, html))
       // console.log(result)
     })
   })
@@ -17,7 +17,7 @@ import {
 // until we have a more robust generic option.
 // Below is the original source for this, for reference.
 const GenericDekExtractor = {
-  extract($, cachedMeta, content) {
+  extract($, content, cachedMeta) {
     return null
   }
 }
@@ -10,14 +10,16 @@ import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
 const GenericExtractor = {
   // This extractor is the default for all domains
   domain: '*',
+  title: GenericTitleExtractor.extract,
+  datePublished : GenericDatePublishedExtractor.extract,
+  author: GenericAuthorExtractor.extract,
+  content: GenericContentExtractor.extract.bind(GenericContentExtractor),
+  leadImageUrl: GenericLeadImageUrlExtractor.extract,
+  dek: GenericDekExtractor.extract,
 
-  parse: (url, html, $) => {
+  parse: function(url, html, $) {
     if (html) {
       $ = cheerio.load(html)
-    } else {
-      // TODO
-      // Fetch link, following redirects
-      // to return html and initialize $
     }
 
     // Cached value of every meta name in our document.
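
Two details in this hunk matter for the next one. parse is changed from an arrow function to a function expression because its body will now call this.title, this.content, and so on, and an arrow function would capture the module's lexical this rather than GenericExtractor; content is stored pre-bound because GenericContentExtractor.extract reads this.defaultOpts internally. A stripped-down illustration of the arrow-vs-function difference, not code from this commit:

const broken = {
  title: () => 'a title',
  // Arrow function: `this` here is the enclosing module scope, not `broken`,
  // so broken.parse() would throw because this.title is undefined.
  parse: () => this.title(),
}

const working = {
  title: () => 'a title',
  // Function expression: `this` is the object the method is called on,
  // so this.title resolves to the field defined above.
  parse: function() { return this.title() },
}

console.log(working.parse())  // => 'a title'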
@@ -26,14 +28,12 @@ const GenericExtractor = {
       return $(node).attr('name')
     }).toArray()
 
-    const title = GenericTitleExtractor.extract($, url, metaCache)
-    const datePublished =
-      GenericDatePublishedExtractor.extract($, url, metaCache)
-    const author = GenericAuthorExtractor.extract($, metaCache)
-    const content = GenericContentExtractor.parse($, html)
-    const leadImageUrl =
-      GenericLeadImageUrlExtractor.extract($, content, metaCache)
-    const dek = GenericDekExtractor.extract($, metaCache, content)
+    const title = this.title($, url, metaCache)
+    const datePublished = this.datePublished($, url, metaCache)
+    const author = this.author($, metaCache)
+    const content = this.content($, html)
+    const leadImageUrl = this.leadImageUrl($, content, metaCache)
+    const dek = this.dek($, content, metaCache)
 
     return {
       title,
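
Routing every lookup through this.* is presumably what "preparing for extraction merging" means: once the generic behaviour lives in plain, overridable fields, a site-specific extractor can be layered over GenericExtractor and the shared parse() picks up the overrides. A hypothetical sketch of that direction, not code from this commit (the domain, selector, and sample html are invented for illustration, and GenericExtractor is assumed to be imported):

// Hypothetical custom extractor: only the title logic is overridden,
// every other field falls back to the generic implementations.
const ExampleComExtractor = {
  ...GenericExtractor,
  domain: 'example.com',
  title: ($, url, metaCache) => $('h1.headline').first().text(),
}

const html = '<html><h1 class="headline">Hello world</h1><p>Body text</p></html>'

// parse() is inherited from GenericExtractor; because it calls this.title(),
// the override above is used while content, author, dek, etc. stay generic.
const result = ExampleComExtractor.parse('https://example.com/post', html)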