mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
refactor: preparing for extraction merging
This commit is contained in:
parent
a022252a14
commit
0f45b39ca2
@ -13,13 +13,6 @@ const GenericContentExtractor = {
|
||||
cleanConditionally: true,
|
||||
},
|
||||
|
||||
// Entry point for parsing html
|
||||
parse($, html, title='', opts={}) {
|
||||
opts = { ...this.defaultOpts, ...opts }
|
||||
|
||||
return this.extract($, html, opts, title)
|
||||
},
|
||||
|
||||
// Extract the content for this resource - initially, pass in our
|
||||
// most restrictive opts which will return the highest quality
|
||||
// content. On each failure, retry with slightly more lax opts.
|
||||
@ -39,7 +32,9 @@ const GenericContentExtractor = {
|
||||
//
|
||||
// cleanConditionally: Clean the returned node of some
|
||||
// superfluous content. Things like forms, ads, etc.
|
||||
extract($, html, opts, title) {
|
||||
extract($, html, opts, title='') {
|
||||
opts = { ...this.defaultOpts, ...opts }
|
||||
|
||||
$ = $ || cheerio.load(html)
|
||||
|
||||
// Cascade through our extraction-specific opts in an ordered fashion,
|
||||
|
@ -8,15 +8,15 @@ import GenericContentExtractor from './extractor'
|
||||
|
||||
describe('GenericContentExtractor', function() {
|
||||
this.timeout(1000000)
|
||||
describe('parse(html, opts)', () => {
|
||||
it("parses html and returns the article", () => {
|
||||
describe('extract($, html, opts)', () => {
|
||||
it("extracts html and returns the article", () => {
|
||||
const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
|
||||
|
||||
// Array.from(range(1, 100)).map((i) => {
|
||||
// console.log(i)
|
||||
// clean(GenericContentExtractor.parse(null, html))
|
||||
// clean(GenericContentExtractor.extract(null, html))
|
||||
// })
|
||||
const result = clean(GenericContentExtractor.parse(null, html))
|
||||
const result = clean(GenericContentExtractor.extract(null, html))
|
||||
// console.log(result)
|
||||
})
|
||||
})
|
||||
|
@ -17,7 +17,7 @@ import {
|
||||
// until we have a more robust generic option.
|
||||
// Below is the original source for this, for reference.
|
||||
const GenericDekExtractor = {
|
||||
extract($, cachedMeta, content) {
|
||||
extract($, content, cachedMeta) {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
@ -10,14 +10,16 @@ import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
|
||||
const GenericExtractor = {
|
||||
// This extractor is the default for all domains
|
||||
domain: '*',
|
||||
title: GenericTitleExtractor.extract,
|
||||
datePublished : GenericDatePublishedExtractor.extract,
|
||||
author: GenericAuthorExtractor.extract,
|
||||
content: GenericContentExtractor.extract.bind(GenericContentExtractor),
|
||||
leadImageUrl: GenericLeadImageUrlExtractor.extract,
|
||||
dek: GenericDekExtractor.extract,
|
||||
|
||||
parse: (url, html, $) => {
|
||||
parse: function(url, html, $) {
|
||||
if (html) {
|
||||
$ = cheerio.load(html)
|
||||
} else {
|
||||
// TODO
|
||||
// Fetch link, following redirects
|
||||
// to return html and initialize $
|
||||
}
|
||||
|
||||
// Cached value of every meta name in our document.
|
||||
@ -26,14 +28,12 @@ const GenericExtractor = {
|
||||
return $(node).attr('name')
|
||||
}).toArray()
|
||||
|
||||
const title = GenericTitleExtractor.extract($, url, metaCache)
|
||||
const datePublished =
|
||||
GenericDatePublishedExtractor.extract($, url, metaCache)
|
||||
const author = GenericAuthorExtractor.extract($, metaCache)
|
||||
const content = GenericContentExtractor.parse($, html)
|
||||
const leadImageUrl =
|
||||
GenericLeadImageUrlExtractor.extract($, content, metaCache)
|
||||
const dek = GenericDekExtractor.extract($, metaCache, content)
|
||||
const title = this.title($, url, metaCache)
|
||||
const datePublished = this.datePublished($, url, metaCache)
|
||||
const author = this.author($, metaCache)
|
||||
const content = this.content($, html)
|
||||
const leadImageUrl = this.leadImageUrl($, content, metaCache)
|
||||
const dek = this.dek($, content, metaCache)
|
||||
|
||||
return {
|
||||
title,
|
||||
|
Loading…
Reference in New Issue
Block a user