mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
chore: remove logic for fetching meta tags with custom attrs (resource
normalizes this now)
This commit is contained in:
parent
c48e3485c0
commit
7b97559778
2
TODO.md
2
TODO.md
@ -1,5 +1,4 @@
|
|||||||
TODO:
|
TODO:
|
||||||
- remove logic for fetching meta attrs with custom props
|
|
||||||
- extractNextPageUrl
|
- extractNextPageUrl
|
||||||
- Rename all cleaners from cleanThing to clean
|
- Rename all cleaners from cleanThing to clean
|
||||||
- Make sure weightNodes flag is being passed properly
|
- Make sure weightNodes flag is being passed properly
|
||||||
@ -10,6 +9,7 @@ TODO:
|
|||||||
- Separate constants into activity-specific folders (dom, scoring)
|
- Separate constants into activity-specific folders (dom, scoring)
|
||||||
|
|
||||||
DONE:
|
DONE:
|
||||||
|
x remove logic for fetching meta attrs with custom props
|
||||||
x cleaning embed and object nodes
|
x cleaning embed and object nodes
|
||||||
x run makeLinksAbsolute on extracted content before returning
|
x run makeLinksAbsolute on extracted content before returning
|
||||||
x add option to fetch attrs in RootExtractor's select method
|
x add option to fetch attrs in RootExtractor's select method
|
||||||
|
@ -15,7 +15,6 @@ describe('GenericExtractor', () => {
|
|||||||
author,
|
author,
|
||||||
datePublished,
|
datePublished,
|
||||||
dek,
|
dek,
|
||||||
leadImageUrl,
|
|
||||||
} = GenericExtractor.extract(
|
} = GenericExtractor.extract(
|
||||||
{ url: "http://latimes.com", html, metaCache: [] }
|
{ url: "http://latimes.com", html, metaCache: [] }
|
||||||
)
|
)
|
||||||
@ -30,7 +29,6 @@ describe('GenericExtractor', () => {
|
|||||||
'2009-10-14T04:00:00.000Z'
|
'2009-10-14T04:00:00.000Z'
|
||||||
)
|
)
|
||||||
assert.equal(dek, null)
|
assert.equal(dek, null)
|
||||||
assert.equal(leadImageUrl, 'http://latimesblogs.latimes.com/fb.jpg')
|
|
||||||
})
|
})
|
||||||
|
|
||||||
it("extracts html and returns the article title", () => {
|
it("extracts html and returns the article title", () => {
|
||||||
@ -41,7 +39,6 @@ describe('GenericExtractor', () => {
|
|||||||
title,
|
title,
|
||||||
datePublished,
|
datePublished,
|
||||||
dek,
|
dek,
|
||||||
leadImageUrl,
|
|
||||||
} = GenericExtractor.extract(
|
} = GenericExtractor.extract(
|
||||||
{ url: "http://wired.com", html, metaCache: [] }
|
{ url: "http://wired.com", html, metaCache: [] }
|
||||||
)
|
)
|
||||||
@ -53,7 +50,6 @@ describe('GenericExtractor', () => {
|
|||||||
)
|
)
|
||||||
assert.equal(datePublished, null)
|
assert.equal(datePublished, null)
|
||||||
assert.equal(dek, null)
|
assert.equal(dek, null)
|
||||||
assert.equal(leadImageUrl, 'https://www.wired.com/wp-content/uploads/2016/08/GettyImages-536814811-1200x630-e1471497753973.jpg')
|
|
||||||
})
|
})
|
||||||
|
|
||||||
})
|
})
|
||||||
|
@ -2,8 +2,8 @@
|
|||||||
// All attributes should be lowercase for faster case-insensitive matching.
|
// All attributes should be lowercase for faster case-insensitive matching.
|
||||||
// From most distinct to least distinct.
|
// From most distinct to least distinct.
|
||||||
export const LEAD_IMAGE_URL_META_TAGS = [
|
export const LEAD_IMAGE_URL_META_TAGS = [
|
||||||
['og:image', 'property', 'content'],
|
'og:image',
|
||||||
['twitter:image', 'name', 'content'],
|
'twitter:image',
|
||||||
'image_src',
|
'image_src',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -3,7 +3,7 @@ const HTML = {
|
|||||||
test: `
|
test: `
|
||||||
<html>
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<meta property="og:image" content="http://example.com/lead.jpg">
|
<meta name="og:image" value="http://example.com/lead.jpg">
|
||||||
</head>
|
</head>
|
||||||
</html>
|
</html>
|
||||||
`,
|
`,
|
||||||
@ -13,7 +13,7 @@ const HTML = {
|
|||||||
test: `
|
test: `
|
||||||
<html>
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<meta name="twitter:image" content="http://example.com/lead.jpg">
|
<meta name="twitter:image" value="http://example.com/lead.jpg">
|
||||||
</head>
|
</head>
|
||||||
</html>
|
</html>
|
||||||
`,
|
`,
|
||||||
|
@ -15,26 +15,14 @@ export default function extractFromMeta(
|
|||||||
cleanTags=true,
|
cleanTags=true,
|
||||||
) {
|
) {
|
||||||
const foundNames = metaNames.filter(name => {
|
const foundNames = metaNames.filter(name => {
|
||||||
const metaType = typeof name
|
return cachedNames.indexOf(name) !== -1
|
||||||
|
|
||||||
if (metaType === 'string') {
|
|
||||||
return cachedNames.indexOf(name) !== -1
|
|
||||||
} else if (metaType === 'object') {
|
|
||||||
return cachedNames.indexOf(name[0]) !== 1
|
|
||||||
}
|
|
||||||
})
|
})
|
||||||
|
|
||||||
for (let name of foundNames) {
|
for (let name of foundNames) {
|
||||||
let type, value
|
let type, value
|
||||||
|
|
||||||
if (typeof name === 'string') {
|
type = 'name'
|
||||||
type = 'name'
|
value = 'value'
|
||||||
value = 'value'
|
|
||||||
} else {
|
|
||||||
type = name[1]
|
|
||||||
value = name[2]
|
|
||||||
name = name[0]
|
|
||||||
}
|
|
||||||
|
|
||||||
const nodes = $(`meta[${type}="${name}"]`)
|
const nodes = $(`meta[${type}="${name}"]`)
|
||||||
|
|
||||||
|
@ -32,16 +32,6 @@ describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => {
|
|||||||
assert.equal(result, HTML.metaEmptyDupes.result)
|
assert.equal(result, HTML.metaEmptyDupes.result)
|
||||||
})
|
})
|
||||||
|
|
||||||
it('accepts custom attributes', () => {
|
|
||||||
const $ = cheerio.load(HTML.custom.test)
|
|
||||||
const metaNames = [['foo', 'property', 'content']]
|
|
||||||
const cachedNames = ['foo']
|
|
||||||
const result = extractFromMeta(
|
|
||||||
$, metaNames, cachedNames
|
|
||||||
)
|
|
||||||
|
|
||||||
assert.equal(result, HTML.custom.result)
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user