feat: GenericExtractLeadImageUrl

Squashed commit of the following:

commit 22d37ebf26dbbd0a3daebbfde3509a6ce04aaf72
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 1 17:50:13 2016 -0400

    feat: GenericExtractLeadImageUrl

commit 3327a0a7929dd0e9267dc9c26f4e2aa78c32586f
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 1 15:33:42 2016 -0400

    feat: can pass custom attributes to extractFromMeta
pull/1/head
Adam Pash 8 years ago
parent 467b600721
commit 0ff3082295

@ -1,9 +1,8 @@
TODO:
Tomorrow:
- extractDek
- extractNextPageUrl
- extractLeadImageUrl
- Try Closure webpack compiler
- Rename all cleaners from cleanThing to clean
- Make sure weightNodes flag is being passed properly
- Get better sense of when cheerio returns a raw node and when a cheerio object
- Remove $ from function calls to getScore
@ -13,6 +12,8 @@ TODO:
DONE:
x extractLeadImageUrl
x extractDek
x extractDatePublished
x Title metadata
x Test re-initializing $ if/when it needs to loop again

@ -21,6 +21,7 @@
"cheerio": "^0.20.0",
"moment": "^2.14.1",
"rollup": "^0.34.10",
"valid-url": "^1.0.9",
"wuzzy": "^0.1.2"
}
}

@ -1,26 +1,3 @@
// An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
export const LEAD_IMAGE_URL_META_TAGS = [
//'og:image',
'image_src',
]
// An ordered list of XPath Selectors to find likely article deks. From
// most explicit to least explicit.
//
// Should be more restrictive than not, as a failed dek can be pretty
// detrimental to the aesthetics of an article.
export const LEAD_IMAGE_URL_SELECTORS = [
{
//selector: '//link[@rel="image_src"]',
}, // hentry microformat
]
//// CONTENT FETCHING CONSTANTS ////
// A list of strings that can be considered unlikely candidates when
@ -188,43 +165,6 @@ export const PHOTO_HINTS = [
]
export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i')
export const POSITIVE_LEAD_IMAGE_URL_HINTS = [
'upload',
'wp-content',
'large',
'photo',
'wp-image',
]
export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
'spacer',
'sprite',
'blank',
'throbber',
'gradient',
'tile',
'bg',
'background',
'icon',
'social',
'header',
'hdr',
'advert',
'spinner',
'loader',
'loading',
'default',
'rating',
'share',
'facebook',
'twitter',
'theme',
'promo',
'ads',
'wp-includes',
]
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
// A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.

@ -1,4 +1,8 @@
import moment from 'moment'
// Is there a compelling reason to use moment here?
// Mostly only being used for the isValid() method,
// but could just check for 'Invalid Date' string.
import {
CLEAN_DATE_STRING_RE,
TIME_MERIDIAN_RE

@ -5,6 +5,7 @@ import GenericTitleExtractor from './title/extractor'
import GenericAuthorExtractor from './author/extractor'
import GenericDatePublishedExtractor from './date-published/extractor'
import GenericDekExtractor from './dek/extractor'
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
const GenericExtractor = {
parse: (url, html) => {
@ -28,6 +29,8 @@ const GenericExtractor = {
GenericDatePublishedExtractor.extract($, url, metaCache)
const author = GenericAuthorExtractor.extract($, metaCache)
const content = GenericContentExtractor.parse($, html)
const leadImageUrl =
GenericLeadImageUrlExtractor.extract($, content, metaCache)
const dek = GenericDekExtractor.extract($, metaCache, content)
return {
@ -35,6 +38,7 @@ const GenericExtractor = {
author,
datePublished,
dek,
leadImageUrl,
content,
}
}

@ -15,6 +15,7 @@ describe('GenericExtractor', () => {
author,
datePublished,
dek,
leadImageUrl,
} = GenericExtractor.parse("http://latimes.com", html)
assert.equal(author, null)
@ -27,6 +28,7 @@ describe('GenericExtractor', () => {
'2009-10-14T04:00:00.000Z'
)
assert.equal(dek, null)
assert.equal(leadImageUrl, 'http://latimesblogs.latimes.com/fb.jpg')
})
it("parses html and returns the article title", () => {
@ -37,6 +39,7 @@ describe('GenericExtractor', () => {
title,
datePublished,
dek,
leadImageUrl,
} = GenericExtractor.parse("http://wired.com", html)
assert.equal(author, 'Eric Adams')
@ -46,6 +49,7 @@ describe('GenericExtractor', () => {
)
assert.equal(datePublished, null)
assert.equal(dek, null)
assert.equal(leadImageUrl, 'https://www.wired.com/wp-content/uploads/2016/08/GettyImages-536814811-1200x630-e1471497753973.jpg')
})
})

@ -0,0 +1,10 @@
import validUrl from 'valid-url'
// Validates and normalizes a lead image URL candidate.
//
// leadImageUrl: a string (possibly padded with whitespace) that may
//               or may not be a valid web URI.
// Returns the trimmed URL when valid-url accepts it as a web URI,
// otherwise null. Null/undefined/non-string input also yields null
// (previously this threw on `.trim()`).
export default function clean(leadImageUrl) {
  // Guard against absent or non-string candidates so callers don't
  // have to pre-check before delegating here.
  if (!leadImageUrl || typeof leadImageUrl !== 'string') {
    return null
  }

  leadImageUrl = leadImageUrl.trim()
  if (validUrl.isWebUri(leadImageUrl)) {
    return leadImageUrl
  } else {
    return null
  }
}

@ -0,0 +1,20 @@
import assert from 'assert'
import clean from './clean'
// Unit tests for clean(): validation and whitespace normalization of
// lead-image URL candidates.
describe('clean(leadImageUrl)', () => {
it('returns the url if valid', () => {
const url = 'https://example.com'
assert.equal(clean(url), url)
})
it('returns null if the url is not valid', () => {
const url = 'this is not a valid url'
assert.equal(clean(url), null)
})
// Leading/trailing whitespace is stripped before validation.
it('trims whitespace', () => {
const url = ' https://example.com/foo/bar.jpg'
assert.equal(clean(url), url.trim())
})
})

@ -0,0 +1,53 @@
// An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
//
// Entries are either a bare meta name (matched via the `name`
// attribute, read from `value`) or a three-element array of
// [name, attribute-to-match, attribute-to-read] — see extractFromMeta.
export const LEAD_IMAGE_URL_META_TAGS = [
  ['og:image', 'property', 'content'],
  ['twitter:image', 'name', 'content'],
  'image_src',
]

// High-probability fallback selectors checked when neither meta tags
// nor content scoring produce a usable image URL.
export const LEAD_IMAGE_URL_SELECTORS = [
  'link[rel=image_src]',
]

// URL substrings that suggest an image is a genuine photo/upload.
export const POSITIVE_LEAD_IMAGE_URL_HINTS = [
  'upload',
  'wp-content',
  'large',
  'photo',
  'wp-image',
]
export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')

// URL substrings that suggest an image is decorative/structural
// (spacers, sprites, social buttons, ads, theme assets, etc.).
export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
  'spacer',
  'sprite',
  'blank',
  'throbber',
  'gradient',
  'tile',
  'bg',
  'background',
  'icon',
  'social',
  'header',
  'hdr',
  'advert',
  'spinner',
  'loader',
  'loading',
  'default',
  'rating',
  'share',
  'facebook',
  'twitter',
  'theme',
  'promo',
  'ads',
  'wp-includes',
]
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')

// File-extension matchers. Note the escaped dot: the previous
// /\gif$/ and /\jpe?g$/ forms match `\g`/`\j` as plain letters, so
// any URL merely ending in "gif"/"jpg" (e.g. "/mygif") matched too.
export const GIF_RE = /\.gif$/i
export const JPG_RE = /\.jpe?g$/i

@ -0,0 +1,268 @@
import 'babel-polyfill'
import {
LEAD_IMAGE_URL_META_TAGS,
LEAD_IMAGE_URL_SELECTORS,
} from './constants'
import {
extractFromMeta,
extractFromSelectors
} from '../utils'
import {
scoreImageUrl,
scoreAttr,
scoreByParents,
scoreBySibling,
scoreByDimensions,
scoreByPosition,
} from './score-image'
import clean from './clean'
// Given a resource, try to find the lead image URL from within it.
// Like content and next-page extraction, a scoring system determines
// the most likely image. Strategies are tried in priority order and
// short-circuit on the first clean result:
//   1. Meta tags (og:image, twitter:image, image_src) — checked first
//      because common practice is now to attach large, curated images
//      via Open Graph / Twitter cards.
//   2. Heuristic scoring of <img> tags inside the extracted content.
//   3. High-probability selectors such as <link rel="image_src" />.
//
// Potential signals to still take advantage of:
// * domain
// * weird aspect ratio
const GenericLeadImageUrlExtractor = {
  extract($, content, cachedMeta) {
    // Strategy 1: a matching meta tag we can make use of.
    const metaUrl = extractFromMeta(
      $,
      LEAD_IMAGE_URL_META_TAGS,
      cachedMeta,
      false
    )
    if (metaUrl) {
      const cleanedMetaUrl = clean(metaUrl)
      if (cleanedMetaUrl) return cleanedMetaUrl
    }

    // Strategy 2: find the "best" image within the content. We'd
    // rather not fetch each image and check dimensions, so analyze
    // the markup and estimate instead.
    const imgs = $('img', content).toArray()
    const imgScores = {}

    imgs.forEach((img, index) => {
      const $img = $(img)
      const src = $img.attr('src')
      if (!src) return

      imgScores[src] =
        scoreImageUrl(src) +
        scoreAttr($img) +
        scoreByParents($img) +
        scoreBySibling($img) +
        scoreByDimensions($img) +
        scoreByPosition(imgs, index)
    })

    // Pick the highest-scoring src; only strictly positive scores
    // are accepted.
    let topUrl = null
    let topScore = 0
    for (const src of Reflect.ownKeys(imgScores)) {
      if (imgScores[src] > topScore) {
        topUrl = src
        topScore = imgScores[src]
      }
    }

    if (topScore > 0) {
      const cleanedTopUrl = clean(topUrl)
      if (cleanedTopUrl) return cleanedTopUrl
    }

    // Strategy 3: really probable nodes in the doc, like
    // <link rel="image_src" />. Each candidate attribute is checked
    // in turn so a dirty src can still fall through to href/value.
    for (const selector of LEAD_IMAGE_URL_SELECTORS) {
      const $node = $(selector).first()
      for (const attr of ['src', 'href', 'value']) {
        const candidate = $node.attr(attr)
        if (candidate) {
          const cleanedCandidate = clean(candidate)
          if (cleanedCandidate) return cleanedCandidate
        }
      }
    }
  },
}

export default GenericLeadImageUrlExtractor
// def extract(self):
// """
// # First, try to find the "best" image via the content.
// # We'd rather not have to fetch each image and check dimensions,
// # so try to do some analysis and determine them instead.
// content = self.extractor.extract_content(return_type="node")
// imgs = content.xpath('.//img')
// img_scores = defaultdict(int)
// logger.debug('Scoring %d images from content', len(imgs))
// for (i, img) in enumerate(imgs):
// img_score = 0
//
// if not 'src' in img.attrib:
// logger.debug('No src attribute found')
// continue
//
// try:
// parsed_img = urlparse(img.attrib['src'])
// img_path = parsed_img.path.lower()
// except ValueError:
// logger.debug('ValueError getting img path.')
// continue
// logger.debug('Image path is %s', img_path)
//
// if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
// logger.debug('Positive URL hints match. Adding 20.')
// img_score += 20
//
// if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
// logger.debug('Negative URL hints match. Subtracting 20.')
// img_score -= 20
//
// # Gifs are more often structure than photos
// if img_path.endswith('gif'):
// logger.debug('gif found. Subtracting 10.')
// img_score -= 10
//
// # JPGs are more often photographs
// if img_path.endswith('jpg'):
// logger.debug('jpg found. Adding 10.')
// img_score += 10
//
// # PNGs are neutral.
//
// # Alt attribute usually means non-presentational image.
// if 'alt' in img.attrib and len(img.attrib['alt']) > 5:
// logger.debug('alt attribute found. Adding 5.')
// img_score += 5
//
// # Look through our parent and grandparent for figure-like
// # container elements, give a bonus if we find them
// parents = [img.getparent()]
// if parents[0] is not None and parents[0].getparent() is not None:
// parents.append(parents[0].getparent())
// for p in parents:
// if p.tag == 'figure':
// logger.debug('Parent with <figure> tag found. Adding 25.')
// img_score += 25
//
// p_sig = ' '.join([p.get('id', ''), p.get('class', '')])
// if constants.PHOTO_HINTS_RE.search(p_sig):
// logger.debug('Photo hints regex match. Adding 15.')
// img_score += 15
//
// # Look at our immediate sibling and see if it looks like it's a
// # caption. Bonus if so.
// sibling = img.getnext()
// if sibling is not None:
// if sibling.tag == 'figcaption':
// img_score += 25
//
// sib_sig = ' '.join([sibling.get('id', ''),
// sibling.get('class', '')]).lower()
// if 'caption' in sib_sig:
// img_score += 15
//
// # Pull out width/height if they were set.
// img_width = None
// img_height = None
// if 'width' in img.attrib:
// try:
// img_width = float(img.get('width'))
// except ValueError:
// pass
// if 'height' in img.attrib:
// try:
// img_height = float(img.get('height'))
// except ValueError:
// pass
//
// # Penalty for skinny images
// if img_width and img_width <= 50:
// logger.debug('Skinny image found. Subtracting 50.')
// img_score -= 50
//
// # Penalty for short images
// if img_height and img_height <= 50:
// # Wide, short images are more common than narrow, tall ones
// logger.debug('Short image found. Subtracting 25.')
// img_score -= 25
//
// if img_width and img_height and not 'sprite' in img_path:
// area = img_width * img_height
//
// if area < 5000: # Smaller than 50x100
// logger.debug('Image with small area found. Subtracting 100.')
// img_score -= 100
// else:
// img_score += round(area/1000.0)
//
// # If the image is higher on the page than other images,
// # it gets a bonus. Penalty if lower.
// logger.debug('Adding page placement bonus of %d.', len(imgs)/2 - i)
// img_score += len(imgs)/2 - i
//
// # Use the raw src here because we munged img_path for case
// # insensitivity
// logger.debug('Final score is %d.', img_score)
// img_scores[img.attrib['src']] += img_score
//
// top_score = 0
// top_url = None
// for (url, score) in img_scores.items():
// if score > top_score:
// top_url = url
// top_score = score
//
// if top_score > 0:
// logger.debug('Using top score image from content. Score was %d', top_score)
// return top_url
//
//
// # If nothing else worked, check to see if there are any really
// # probable nodes in the doc, like <link rel="image_src" />.
// logger.debug('Trying to find lead image in probable nodes')
// for selector in constants.LEAD_IMAGE_URL_SELECTORS:
// nodes = self.resource.extract_by_selector(selector)
// for node in nodes:
// clean_value = None
// if node.attrib.get('src'):
// clean_value = self.clean(node.attrib['src'])
//
// if not clean_value and node.attrib.get('href'):
// clean_value = self.clean(node.attrib['href'])
//
// if not clean_value and node.attrib.get('value'):
// clean_value = self.clean(node.attrib['value'])
//
// if clean_value:
// logger.debug('Found lead image in probable nodes.')
// logger.debug('Node was: %s', node)
// return clean_value
//
// return None

@ -0,0 +1,54 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import GenericLeadImageUrlExtractor from './extractor'
// Integration tests for GenericLeadImageUrlExtractor.extract,
// exercising each extraction strategy in priority order: meta tags,
// content scoring, then fallback selectors.
describe('GenericLeadImageUrlExtractor', () => {
describe('extract($, content, cachedMeta)', () => {
// og:image is the most distinct meta tag and wins outright.
it('returns og:image first', () => {
const $ = cheerio.load(HTML.og.test)
const content = $('*').first()
const cachedMeta = ['og:image']
const result =
GenericLeadImageUrlExtractor.extract($, content, cachedMeta)
assert.equal(result, HTML.og.result)
})
it('returns twitter:image', () => {
const $ = cheerio.load(HTML.twitter.test)
const content = $('*').first()
const cachedMeta = ['twitter:image']
const result =
GenericLeadImageUrlExtractor.extract($, content, cachedMeta)
assert.equal(result, HTML.twitter.result)
})
// With no cached meta tags, falls through to scoring imgs in content.
it('finds images based on scoring', () => {
const $ = cheerio.load(HTML.scoring.test)
const content = $('*').first()
const cachedMeta = []
const result =
GenericLeadImageUrlExtractor.extract($, content, cachedMeta)
assert.equal(result, HTML.scoring.result)
})
// Last resort: probable nodes like <link rel="image_src">.
it('returns image based on selectors', () => {
const $ = cheerio.load(HTML.selectors.test)
const content = $('*').first()
const cachedMeta = []
const result =
GenericLeadImageUrlExtractor.extract($, content, cachedMeta)
assert.equal(result, HTML.selectors.result)
})
})
})

@ -0,0 +1,42 @@
// HTML fixtures for the GenericLeadImageUrlExtractor tests. Each key
// holds a `test` document and the `result` URL expected from it.
const HTML = {
// og:image meta tag — strategy 1, most distinct.
og: {
test: `
<html>
<head>
<meta property="og:image" content="http://example.com/lead.jpg">
</head>
</html>
`,
result: `http://example.com/lead.jpg`
},
// twitter:image meta tag — strategy 1, second choice.
twitter: {
test: `
<html>
<head>
<meta name="twitter:image" content="http://example.com/lead.jpg">
</head>
</html>
`,
result: `http://example.com/lead.jpg`
},
// No meta tags: the "upload" jpg should out-score the sprite and png.
scoring: {
test: `
<div>
<img src="http://example.com/sprite/abadpic.jpg" />
<img src="http://example.com/upload/goodpic.jpg" />
<img src="http://example.com/upload/whateverpic.png" />
</div>
`,
result: `http://example.com/upload/goodpic.jpg`
},
// Fallback selector: <link rel="image_src"> href.
selectors: {
test: `
<div>
<link rel="image_src" href="http://example.com/upload/goodpic.jpg">
</div>
`,
result: `http://example.com/upload/goodpic.jpg`
},
}
export default HTML

@ -0,0 +1,125 @@
import {
POSITIVE_LEAD_IMAGE_URL_HINTS_RE,
NEGATIVE_LEAD_IMAGE_URL_HINTS_RE,
GIF_RE,
JPG_RE,
} from './constants'
import { PHOTO_HINTS_RE } from '../content/utils/constants'
// Scores an image URL based on a variety of string heuristics:
// known-good path hints add points, known-bad hints and structural
// image types (gifs) subtract them. Returns the numeric score.
export function scoreImageUrl(url) {
  const imageUrl = url.trim()
  let score = 0

  if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(imageUrl)) {
    score += 20
  }
  if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(imageUrl)) {
    score -= 20
  }

  // Gifs are more often structure than photos.
  // TODO: We might want to consider removing this as gifs are much
  // more common/popular than they once were.
  if (GIF_RE.test(imageUrl)) {
    score -= 10
  }

  // JPGs are more often photographs. PNGs are neutral.
  if (JPG_RE.test(imageUrl)) {
    score += 10
  }

  return score
}
// An alt attribute usually means a non-presentational image; award a
// small bonus when it is present and non-empty.
export function scoreAttr($img) {
  return $img.attr('alt') ? 5 : 0
}
// Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them.
//
// +25 when any ancestor is a <figure>; +15 for each of parent and
// grandparent whose class/id matches PHOTO_HINTS_RE.
export function scoreByParents($img) {
  let score = 0

  const $figParent = $img.parents('figure').first()
  if ($figParent.length === 1) {
    score = score + 25
  }

  const $parent = $img.parent()
  let $gParent
  if ($parent.length === 1) {
    $gParent = $parent.parent()
  }

  // $gParent stays undefined when the image has no single parent;
  // filter out undefined/empty selections so getSig isn't called on
  // undefined (previously a TypeError).
  const $candidates = [$parent, $gParent].filter(
    $node => $node && $node.length > 0
  )
  $candidates.forEach(($node) => {
    if (PHOTO_HINTS_RE.test(getSig($node))) {
      score = score + 15
    }
  })

  return score
}
// Look at the image's immediate next sibling and see if it looks
// like a caption: +25 for a <figcaption> element, +15 when its
// class/id matches the photo-hint regex.
export function scoreBySibling($img) {
  const $sibling = $img.next()
  const siblingNode = $sibling.get(0)

  let score = 0
  if (siblingNode && siblingNode.tagName === 'figcaption') {
    score += 25
  }
  if (PHOTO_HINTS_RE.test(getSig($sibling))) {
    score += 15
  }

  return score
}
// Scores an image by its declared width/height attributes: penalties
// for skinny or short images, a large penalty for tiny areas, and a
// bonus proportional to area for big images. Sprites are excluded
// from area scoring.
export function scoreByDimensions($img) {
  let score = 0

  // parseFloat(undefined) is NaN, which short-circuits the checks
  // below when a dimension attribute is absent.
  const width = parseFloat($img.attr('width'))
  const height = parseFloat($img.attr('height'))
  const src = $img.attr('src')

  // Penalty for skinny images
  if (width && width <= 50) {
    score = score - 50
  }

  // Penalty for short images
  if (height && height <= 50) {
    score = score - 50
  }

  // A missing src is treated as "not a sprite" (previously
  // src.includes threw a TypeError on undefined).
  if (width && height && !(src || '').includes('sprite')) {
    const area = width * height
    if (area < 5000) { // Smaller than 50 x 100
      score = score - 100
    } else {
      score = score + Math.round(area / 1000)
    }
  }

  return score
}
// Images higher on the page get a bonus, lower ones a penalty:
// returns ($imgs.length / 2) - index, so the midpoint scores 0.
export function scoreByPosition($imgs, index) {
  const midpoint = $imgs.length / 2
  return midpoint - index
}
// Builds a "signature" string from a node's class and id attributes
// (either may be absent) for matching against hint regexes.
function getSig($node) {
  const className = $node.attr('class') || ''
  const id = $node.attr('id') || ''
  return `${className} ${id}`
}

@ -0,0 +1,222 @@
import assert from 'assert'
import cheerio from 'cheerio'
import {
scoreImageUrl,
scoreAttr,
scoreByParents,
scoreBySibling,
scoreByDimensions,
scoreByPosition,
} from './score-image'
// Tests for scoreImageUrl: URL-string scoring heuristics.
// (describe name fixed: was the typo 'scoreImageUrlUrl(url)'.)
describe('scoreImageUrl(url)', () => {
  it('gets 20 points for a positive lead img hint', () => {
    const url = 'http://example.com/upload/img.png'
    assert.equal(scoreImageUrl(url), 20)
  })

  it('loses 20 points for a negative lead img hint', () => {
    const url = 'http://example.com/sprite/foo/bar.png'
    assert.equal(scoreImageUrl(url), -20)
  })

  it('loses 10 points for a gif', () => {
    const url = 'http://example.com/foo/bar.gif'
    assert.equal(scoreImageUrl(url), -10)

    // "gif" embedded mid-path must not trigger the extension penalty.
    const url2 = 'http://example.com/foogif/bar'
    assert.equal(scoreImageUrl(url2), 0)
  })

  it('gains 10 points for a jpg', () => {
    const url = 'http://example.com/foo/bar.jpg'
    assert.equal(scoreImageUrl(url), 10)

    const url2 = 'http://example.com/foo/bar.jpeg'
    assert.equal(scoreImageUrl(url2), 10)

    // "jpg" embedded mid-path must not trigger the extension bonus.
    const url3 = 'http://example.com/foojpg/bar'
    assert.equal(scoreImageUrl(url3), 0)
  })
})
// Tests for scoreAttr: bonus for a non-presentational alt attribute.
describe('scoreAttr($img)', () => {
  it('gets 5 points if the img node has an alt attribute', () => {
    const $ = cheerio.load('<div><img alt="Wow" /></div>')
    const $img = $('img').first()
    assert.equal(scoreAttr($img), 5)
  })

  // Description fixed: it previously duplicated the "has an alt
  // attribute" wording while actually testing the absence case.
  it('gets 0 points if the img node has no alt attribute', () => {
    const $ = cheerio.load('<div><img /></div>')
    const $img = $('img').first()
    assert.equal(scoreAttr($img), 0)
  })
})
// Tests for scoreByParents: <figure> ancestors and photo-hint
// class/id names on the parent or grandparent add points.
describe('scoreByParents($img)', () => {
it('gets 25 points if it has a figure parent', () => {
const $ = cheerio.load(
`<div>
<figure>
<div>
<img alt="Wow" />
</div>
</figure>
</div>`
)
const $img = $('img').first()
assert.equal(scoreByParents($img), 25)
})
it('gets 0 points if the img has no figure parent', () => {
const $ = cheerio.load('<div><img /></div>')
const $img = $('img').first()
assert.equal(scoreByParents($img), 0)
})
// class="figure" on an ancestor matches PHOTO_HINTS_RE.
it('gets 15 points if parent or gparent has photo hints', () => {
const $ = cheerio.load(
`<div>
<div class="figure">
<div>
<img alt="Wow" />
</div>
</div>
</div>`
)
const $img = $('img').first()
assert.equal(scoreByParents($img), 15)
})
})
// Tests for scoreBySibling: caption-like markup immediately after
// the image (a <figcaption> element, or photo-hint class names).
describe('scoreBySibling($img)', () => {
it('gets 25 points if its sibling is figcaption', () => {
const $ = cheerio.load(
`
<div>
<img />
<figcaption>Wow</figcaption>
</div>
`
)
const $img = $('img').first()
assert.equal(scoreBySibling($img), 25)
})
// class="caption" on the next sibling matches PHOTO_HINTS_RE.
it('gets 15 points if its sibling has photo hints', () => {
const $ = cheerio.load(
`<div>
<div>
<img alt="Wow" />
<div class="caption">
Wow
</div>
</div>
</div>`
)
const $img = $('img').first()
assert.equal(scoreBySibling($img), 15)
})
})
// Tests for scoreByDimensions: width/height attribute heuristics —
// penalties for skinny, short, or tiny-area images; sprites are
// excluded from area scoring; large areas earn a proportional bonus.
describe('scoreByDimensions($img)', () => {
it('penalizes skinny images', () => {
const $ = cheerio.load(
`
<div>
<img width="10" />
</div>
`
)
const $img = $('img').first()
assert.equal(scoreByDimensions($img), -50)
})
it('penalizes short images', () => {
const $ = cheerio.load(
`
<div>
<img height="10" />
</div>
`
)
const $img = $('img').first()
assert.equal(scoreByDimensions($img), -50)
})
// "sprite" in the src skips the area bonus entirely.
it('ignores sprites', () => {
const $ = cheerio.load(
`
<div>
<img src="/sprite/etc/foo.png" width="1000" height="1000" />
</div>
`
)
const $img = $('img').first()
assert.equal(scoreByDimensions($img), 0)
})
// 60 x 60 = 3600 < 5000, so the small-area penalty applies.
it('penalizes images with small areas', () => {
const $ = cheerio.load(
`
<div>
<img src="/etc/foo.png" width="60" height="60" />
</div>
`
)
const $img = $('img').first()
assert.equal(scoreByDimensions($img), -100)
})
// 1000 x 1000 = 1,000,000; bonus is round(area / 1000) = 1000.
it('prefers the largest images', () => {
const $ = cheerio.load(
`
<div>
<img src="/etc/foo.png" width="1000" height="1000" />
</div>
`
)
const $img = $('img').first()
assert.equal(scoreByDimensions($img), 1000)
})
})
// Tests for scoreByPosition: earlier images score higher.
// With 6 images, index 0 scores 6/2 - 0 = 3.
describe('scoreByPosition($imgs, index)', () => {
it('gives higher scores to images that come first', () => {
const $ = cheerio.load(
`
<div>
<img width="10" />
<img width="10" />
<img width="10" />
<img width="10" />
<img width="10" />
<img width="10" />
</div>
`
)
const $imgs = $('img')
assert.equal(scoreByPosition($imgs, 0), 3)
})
})

@ -2,25 +2,47 @@ import { stripTags } from '../../utils'
// Given a node type to search for, and a list of meta tag names to
// search for, find a meta tag associated.
// metaNames can be an array of strings or an array of three-element
// arrays that will define the attributes to select from the meta
// elements. E.g., ['og:image', 'property', 'content'] will search
// $('meta[property=og:image]').attr('content').
//
// Default is $('meta[name=og:image]').attr(value)
export default function extractFromMeta(
$,
metaNames,
cachedNames,
cleanTags=true
cleanTags=true,
) {
const foundNames = metaNames.filter(name =>
cachedNames.indexOf(name) !== -1
)
let metaValue
for (const name of foundNames) {
const nodes = $(`meta[name="${name}"]`)
const foundNames = metaNames.filter(name => {
const metaType = typeof name
if (metaType === 'string') {
return cachedNames.indexOf(name) !== -1
} else if (metaType === 'object') {
return cachedNames.indexOf(name[0]) !== 1
}
})
for (let name of foundNames) {
let type, value
if (typeof name === 'string') {
type = 'name'
value = 'value'
} else {
type = name[1]
value = name[2]
name = name[0]
}
const nodes = $(`meta[${type}="${name}"]`)
// Get the unique value of every matching node, in case there
// are two meta tags with the same name and value.
// Remove empty values.
const values =
$(`meta[name="${name}"]`).map((index, node) => $(node).attr('value'))
nodes.map((index, node) => $(node).attr(value))
.toArray()
.filter(text => text !== '')
@ -32,6 +54,7 @@ export default function extractFromMeta(
continue
}
let metaValue
// Meta values that contain HTML should be stripped, as they
// weren't subject to cleaning previously.
if (cleanTags) {

@ -32,6 +32,16 @@ describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => {
assert.equal(result, HTML.metaEmptyDupes.result)
})
it('accepts custom attributes', () => {
const $ = cheerio.load(HTML.custom.test)
const metaNames = [['foo', 'property', 'content']]
const cachedNames = ['foo']
const result = extractFromMeta(
$, metaNames, cachedNames
)
assert.equal(result, HTML.custom.result)
})
})

@ -23,6 +23,13 @@ const HTML = {
</html>`,
result: `bar`,
},
custom: {
test: `
<html>
<meta property="foo" content="bar" />
</html>`,
result: `bar`,
},
// extractFromSelectors
simpleSelector: {

Loading…
Cancel
Save