chore: remove logic for fetching meta tags with custom attrs (resource normalizes this now)
This commit is contained in:
Adam Pash 2016-09-09 13:56:06 -04:00
parent c48e3485c0
commit 7b97559778
6 changed files with 8 additions and 34 deletions

View File

@ -1,5 +1,4 @@
TODO: TODO:
- remove logic for fetching meta attrs with custom props
- extractNextPageUrl - extractNextPageUrl
- Rename all cleaners from cleanThing to clean - Rename all cleaners from cleanThing to clean
- Make sure weightNodes flag is being passed properly - Make sure weightNodes flag is being passed properly
@ -10,6 +9,7 @@ TODO:
- Separate constants into activity-specific folders (dom, scoring) - Separate constants into activity-specific folders (dom, scoring)
DONE: DONE:
x remove logic for fetching meta attrs with custom props
x cleaning embed and object nodes x cleaning embed and object nodes
x run makeLinksAbsolute on extracted content before returning x run makeLinksAbsolute on extracted content before returning
x add option to fetch attrs in RootExtractor's select method x add option to fetch attrs in RootExtractor's select method

View File

@ -15,7 +15,6 @@ describe('GenericExtractor', () => {
author, author,
datePublished, datePublished,
dek, dek,
leadImageUrl,
} = GenericExtractor.extract( } = GenericExtractor.extract(
{ url: "http://latimes.com", html, metaCache: [] } { url: "http://latimes.com", html, metaCache: [] }
) )
@ -30,7 +29,6 @@ describe('GenericExtractor', () => {
'2009-10-14T04:00:00.000Z' '2009-10-14T04:00:00.000Z'
) )
assert.equal(dek, null) assert.equal(dek, null)
assert.equal(leadImageUrl, 'http://latimesblogs.latimes.com/fb.jpg')
}) })
it("extracts html and returns the article title", () => { it("extracts html and returns the article title", () => {
@ -41,7 +39,6 @@ describe('GenericExtractor', () => {
title, title,
datePublished, datePublished,
dek, dek,
leadImageUrl,
} = GenericExtractor.extract( } = GenericExtractor.extract(
{ url: "http://wired.com", html, metaCache: [] } { url: "http://wired.com", html, metaCache: [] }
) )
@ -53,7 +50,6 @@ describe('GenericExtractor', () => {
) )
assert.equal(datePublished, null) assert.equal(datePublished, null)
assert.equal(dek, null) assert.equal(dek, null)
assert.equal(leadImageUrl, 'https://www.wired.com/wp-content/uploads/2016/08/GettyImages-536814811-1200x630-e1471497753973.jpg')
}) })
}) })

View File

@ -2,8 +2,8 @@
// All attributes should be lowercase for faster case-insensitive matching. // All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct. // From most distinct to least distinct.
export const LEAD_IMAGE_URL_META_TAGS = [ export const LEAD_IMAGE_URL_META_TAGS = [
['og:image', 'property', 'content'], 'og:image',
['twitter:image', 'name', 'content'], 'twitter:image',
'image_src', 'image_src',
] ]

View File

@ -3,7 +3,7 @@ const HTML = {
test: ` test: `
<html> <html>
<head> <head>
<meta property="og:image" content="http://example.com/lead.jpg"> <meta name="og:image" value="http://example.com/lead.jpg">
</head> </head>
</html> </html>
`, `,
@ -13,7 +13,7 @@ const HTML = {
test: ` test: `
<html> <html>
<head> <head>
<meta name="twitter:image" content="http://example.com/lead.jpg"> <meta name="twitter:image" value="http://example.com/lead.jpg">
</head> </head>
</html> </html>
`, `,

View File

@ -15,26 +15,14 @@ export default function extractFromMeta(
cleanTags=true, cleanTags=true,
) { ) {
const foundNames = metaNames.filter(name => { const foundNames = metaNames.filter(name => {
const metaType = typeof name return cachedNames.indexOf(name) !== -1
if (metaType === 'string') {
return cachedNames.indexOf(name) !== -1
} else if (metaType === 'object') {
return cachedNames.indexOf(name[0]) !== 1
}
}) })
for (let name of foundNames) { for (let name of foundNames) {
let type, value let type, value
if (typeof name === 'string') { type = 'name'
type = 'name' value = 'value'
value = 'value'
} else {
type = name[1]
value = name[2]
name = name[0]
}
const nodes = $(`meta[${type}="${name}"]`) const nodes = $(`meta[${type}="${name}"]`)

View File

@ -32,16 +32,6 @@ describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => {
assert.equal(result, HTML.metaEmptyDupes.result) assert.equal(result, HTML.metaEmptyDupes.result)
}) })
it('accepts custom attributes', () => {
const $ = cheerio.load(HTML.custom.test)
const metaNames = [['foo', 'property', 'content']]
const cachedNames = ['foo']
const result = extractFromMeta(
$, metaNames, cachedNames
)
assert.equal(result, HTML.custom.result)
})
}) })