chore: remove logic for fetching meta tags with custom attrs (resource normalizes this now)
This commit is contained in:
Adam Pash 2016-09-09 13:56:06 -04:00
parent c48e3485c0
commit 7b97559778
6 changed files with 8 additions and 34 deletions

View File

@@ -1,5 +1,4 @@
TODO:
- remove logic for fetching meta attrs with custom props
- extractNextPageUrl
- Rename all cleaners from cleanThing to clean
- Make sure weightNodes flag is being passed properly
@@ -10,6 +9,7 @@ TODO:
- Separate constants into activity-specific folders (dom, scoring)
DONE:
x remove logic for fetching meta attrs with custom props
x cleaning embed and object nodes
x run makeLinksAbsolute on extracted content before returning
x add option to fetch attrs in RootExtractor's select method

View File

@@ -15,7 +15,6 @@ describe('GenericExtractor', () => {
author,
datePublished,
dek,
leadImageUrl,
} = GenericExtractor.extract(
{ url: "http://latimes.com", html, metaCache: [] }
)
@@ -30,7 +29,6 @@ describe('GenericExtractor', () => {
'2009-10-14T04:00:00.000Z'
)
assert.equal(dek, null)
assert.equal(leadImageUrl, 'http://latimesblogs.latimes.com/fb.jpg')
})
it("extracts html and returns the article title", () => {
@@ -41,7 +39,6 @@ describe('GenericExtractor', () => {
title,
datePublished,
dek,
leadImageUrl,
} = GenericExtractor.extract(
{ url: "http://wired.com", html, metaCache: [] }
)
@@ -53,7 +50,6 @@ describe('GenericExtractor', () => {
)
assert.equal(datePublished, null)
assert.equal(dek, null)
assert.equal(leadImageUrl, 'https://www.wired.com/wp-content/uploads/2016/08/GettyImages-536814811-1200x630-e1471497753973.jpg')
})
})

View File

@@ -2,8 +2,8 @@
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
export const LEAD_IMAGE_URL_META_TAGS = [
['og:image', 'property', 'content'],
['twitter:image', 'name', 'content'],
'og:image',
'twitter:image',
'image_src',
]

View File

@@ -3,7 +3,7 @@ const HTML = {
test: `
<html>
<head>
<meta property="og:image" content="http://example.com/lead.jpg">
<meta name="og:image" value="http://example.com/lead.jpg">
</head>
</html>
`,
@@ -13,7 +13,7 @@ const HTML = {
test: `
<html>
<head>
<meta name="twitter:image" content="http://example.com/lead.jpg">
<meta name="twitter:image" value="http://example.com/lead.jpg">
</head>
</html>
`,

View File

@@ -15,26 +15,14 @@ export default function extractFromMeta(
cleanTags=true,
) {
const foundNames = metaNames.filter(name => {
const metaType = typeof name
if (metaType === 'string') {
return cachedNames.indexOf(name) !== -1
} else if (metaType === 'object') {
return cachedNames.indexOf(name[0]) !== 1
}
return cachedNames.indexOf(name) !== -1
})
for (let name of foundNames) {
let type, value
if (typeof name === 'string') {
type = 'name'
value = 'value'
} else {
type = name[1]
value = name[2]
name = name[0]
}
type = 'name'
value = 'value'
const nodes = $(`meta[${type}="${name}"]`)

View File

@@ -32,16 +32,6 @@ describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => {
assert.equal(result, HTML.metaEmptyDupes.result)
})
it('accepts custom attributes', () => {
const $ = cheerio.load(HTML.custom.test)
const metaNames = [['foo', 'property', 'content']]
const cachedNames = ['foo']
const result = extractFromMeta(
$, metaNames, cachedNames
)
assert.equal(result, HTML.custom.result)
})
})