mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
chore: remove logic for fetching meta tags with custom attrs (resource
normalizes this now)
This commit is contained in:
parent
c48e3485c0
commit
7b97559778
2
TODO.md
2
TODO.md
@ -1,5 +1,4 @@
|
||||
TODO:
|
||||
- remove logic for fetching meta attrs with custom props
|
||||
- extractNextPageUrl
|
||||
- Rename all cleaners from cleanThing to clean
|
||||
- Make sure weightNodes flag is being passed properly
|
||||
@ -10,6 +9,7 @@ TODO:
|
||||
- Separate constants into activity-specific folders (dom, scoring)
|
||||
|
||||
DONE:
|
||||
x remove logic for fetching meta attrs with custom props
|
||||
x cleaning embed and object nodes
|
||||
x run makeLinksAbsolute on extracted content before returning
|
||||
x add option to fetch attrs in RootExtractor's select method
|
||||
|
@ -15,7 +15,6 @@ describe('GenericExtractor', () => {
|
||||
author,
|
||||
datePublished,
|
||||
dek,
|
||||
leadImageUrl,
|
||||
} = GenericExtractor.extract(
|
||||
{ url: "http://latimes.com", html, metaCache: [] }
|
||||
)
|
||||
@ -30,7 +29,6 @@ describe('GenericExtractor', () => {
|
||||
'2009-10-14T04:00:00.000Z'
|
||||
)
|
||||
assert.equal(dek, null)
|
||||
assert.equal(leadImageUrl, 'http://latimesblogs.latimes.com/fb.jpg')
|
||||
})
|
||||
|
||||
it("extracts html and returns the article title", () => {
|
||||
@ -41,7 +39,6 @@ describe('GenericExtractor', () => {
|
||||
title,
|
||||
datePublished,
|
||||
dek,
|
||||
leadImageUrl,
|
||||
} = GenericExtractor.extract(
|
||||
{ url: "http://wired.com", html, metaCache: [] }
|
||||
)
|
||||
@ -53,7 +50,6 @@ describe('GenericExtractor', () => {
|
||||
)
|
||||
assert.equal(datePublished, null)
|
||||
assert.equal(dek, null)
|
||||
assert.equal(leadImageUrl, 'https://www.wired.com/wp-content/uploads/2016/08/GettyImages-536814811-1200x630-e1471497753973.jpg')
|
||||
})
|
||||
|
||||
})
|
||||
|
@ -2,8 +2,8 @@
|
||||
// All attributes should be lowercase for faster case-insensitive matching.
|
||||
// From most distinct to least distinct.
|
||||
export const LEAD_IMAGE_URL_META_TAGS = [
|
||||
['og:image', 'property', 'content'],
|
||||
['twitter:image', 'name', 'content'],
|
||||
'og:image',
|
||||
'twitter:image',
|
||||
'image_src',
|
||||
]
|
||||
|
||||
|
@ -3,7 +3,7 @@ const HTML = {
|
||||
test: `
|
||||
<html>
|
||||
<head>
|
||||
<meta property="og:image" content="http://example.com/lead.jpg">
|
||||
<meta name="og:image" value="http://example.com/lead.jpg">
|
||||
</head>
|
||||
</html>
|
||||
`,
|
||||
@ -13,7 +13,7 @@ const HTML = {
|
||||
test: `
|
||||
<html>
|
||||
<head>
|
||||
<meta name="twitter:image" content="http://example.com/lead.jpg">
|
||||
<meta name="twitter:image" value="http://example.com/lead.jpg">
|
||||
</head>
|
||||
</html>
|
||||
`,
|
||||
|
@ -15,26 +15,14 @@ export default function extractFromMeta(
|
||||
cleanTags=true,
|
||||
) {
|
||||
const foundNames = metaNames.filter(name => {
|
||||
const metaType = typeof name
|
||||
|
||||
if (metaType === 'string') {
|
||||
return cachedNames.indexOf(name) !== -1
|
||||
} else if (metaType === 'object') {
|
||||
return cachedNames.indexOf(name[0]) !== 1
|
||||
}
|
||||
return cachedNames.indexOf(name) !== -1
|
||||
})
|
||||
|
||||
for (let name of foundNames) {
|
||||
let type, value
|
||||
|
||||
if (typeof name === 'string') {
|
||||
type = 'name'
|
||||
value = 'value'
|
||||
} else {
|
||||
type = name[1]
|
||||
value = name[2]
|
||||
name = name[0]
|
||||
}
|
||||
type = 'name'
|
||||
value = 'value'
|
||||
|
||||
const nodes = $(`meta[${type}="${name}"]`)
|
||||
|
||||
|
@ -32,16 +32,6 @@ describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => {
|
||||
assert.equal(result, HTML.metaEmptyDupes.result)
|
||||
})
|
||||
|
||||
it('accepts custom attributes', () => {
|
||||
const $ = cheerio.load(HTML.custom.test)
|
||||
const metaNames = [['foo', 'property', 'content']]
|
||||
const cachedNames = ['foo']
|
||||
const result = extractFromMeta(
|
||||
$, metaNames, cachedNames
|
||||
)
|
||||
|
||||
assert.equal(result, HTML.custom.result)
|
||||
})
|
||||
})
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user