feat: GenericExtractLeadImageUrl
Squashed commit of the following: commit 22d37ebf26dbbd0a3daebbfde3509a6ce04aaf72 Author: Adam Pash <adam.pash@gmail.com> Date: Thu Sep 1 17:50:13 2016 -0400 feat: GenericExtractLeadImageUrl commit 3327a0a7929dd0e9267dc9c26f4e2aa78c32586f Author: Adam Pash <adam.pash@gmail.com> Date: Thu Sep 1 15:33:42 2016 -0400 feat: can pass custom attributes to extractFromMeta (ref: pull/1/head)
parent
467b600721
commit
0ff3082295
@ -0,0 +1,10 @@
|
||||
import validUrl from 'valid-url'
|
||||
|
||||
/**
 * Normalizes a candidate lead image URL.
 *
 * Surrounding whitespace is trimmed and the result is checked with
 * `validUrl.isWebUri`.
 *
 * @param {string} leadImageUrl - Raw URL text taken from the document.
 * @returns {?string} The trimmed URL when it validates as a web URI,
 *   otherwise null (including when the input is not a string).
 */
export default function clean(leadImageUrl) {
  // Non-string input (null, undefined, numbers, ...) has no `.trim()`;
  // treat it as "no usable URL" instead of throwing a TypeError.
  if (typeof leadImageUrl !== 'string') return null

  // Work on a local copy rather than reassigning the parameter.
  const trimmed = leadImageUrl.trim()

  return validUrl.isWebUri(trimmed) ? trimmed : null
}
|
@ -0,0 +1,20 @@
|
||||
import assert from 'assert'
|
||||
|
||||
import clean from './clean'
|
||||
|
||||
describe('clean(leadImageUrl)', () => {
  // A well-formed web URL passes through unchanged.
  it('returns the url if valid', () => {
    const validInput = 'https://example.com'
    const cleaned = clean(validInput)

    assert.equal(cleaned, validInput)
  })

  // Text that does not validate as a web URI yields null.
  it('returns null if the url is not valid', () => {
    const cleaned = clean('this is not a valid url')

    assert.equal(cleaned, null)
  })

  // Leading whitespace is removed before validation.
  it('trims whitespace', () => {
    const paddedInput = ' https://example.com/foo/bar.jpg'
    const expected = paddedInput.trim()

    assert.equal(clean(paddedInput), expected)
  })
})
|
@ -0,0 +1,53 @@
|
||||
// An ordered list of meta tag names that denote likely article leading images.
|
||||
// All attributes should be lowercase for faster case-insensitive matching.
|
||||
// From most distinct to least distinct.
|
||||
// Entries are either a bare meta name or a three-element array.
// NOTE(review): the array form looks like
// [tag name, attribute carrying the name, attribute carrying the value]
// -- confirm against extractFromMeta.
export const LEAD_IMAGE_URL_META_TAGS = [
  ['og:image', 'property', 'content'], // Open Graph
  ['twitter:image', 'name', 'content'], // Twitter card
  'image_src',
]
|
||||
|
||||
// CSS selectors for nodes that are very likely to point at the lead image,
// e.g. <link rel="image_src" href="...">.
export const LEAD_IMAGE_URL_SELECTORS = ['link[rel=image_src]']
|
||||
|
||||
// URL fragments that suggest an image is real article content.
export const POSITIVE_LEAD_IMAGE_URL_HINTS = [
  'upload', 'wp-content', 'large', 'photo', 'wp-image',
]

// Case-insensitive alternation over every positive hint.
export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(
  POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'),
  'i'
)
|
||||
|
||||
// URL fragments that suggest an image is page chrome (spacers, icons,
// ads, share buttons, ...) rather than article content.
export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
  'spacer', 'sprite', 'blank', 'throbber', 'gradient',
  'tile', 'bg', 'background', 'icon', 'social',
  'header', 'hdr', 'advert', 'spinner', 'loader',
  'loading', 'default', 'rating', 'share', 'facebook',
  'twitter', 'theme', 'promo', 'ads', 'wp-includes',
]

// Case-insensitive alternation over every negative hint.
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(
  NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'),
  'i'
)
|
||||
|
||||
// Match URLs that end in "gif" / "jpg" / "jpeg", case-insensitively.
// The previous patterns were written as /\gif$/ and /\jpe?g$/: `\g` and `\j`
// are meaningless identity escapes that only happen to match the bare
// letter, and they are a SyntaxError under the `u` flag. The plain forms
// below match exactly the same strings.
export const GIF_RE = /gif$/i
export const JPG_RE = /jpe?g$/i
|
@ -0,0 +1,268 @@
|
||||
import 'babel-polyfill'
|
||||
|
||||
import {
|
||||
LEAD_IMAGE_URL_META_TAGS,
|
||||
LEAD_IMAGE_URL_SELECTORS,
|
||||
} from './constants'
|
||||
|
||||
import {
|
||||
extractFromMeta,
|
||||
extractFromSelectors
|
||||
} from '../utils'
|
||||
|
||||
import {
|
||||
scoreImageUrl,
|
||||
scoreAttr,
|
||||
scoreByParents,
|
||||
scoreBySibling,
|
||||
scoreByDimensions,
|
||||
scoreByPosition,
|
||||
} from './score-image'
|
||||
|
||||
import clean from './clean'
|
||||
|
||||
// Given a resource, try to find the lead image URL from within
|
||||
// it. Like content and next page extraction, uses a scoring system
|
||||
// to determine what the most likely image may be. Short circuits
|
||||
// on really probable things like og:image meta tags.
|
||||
//
|
||||
// Potential signals to still take advantage of:
|
||||
// * domain
|
||||
// * weird aspect ratio
|
||||
const GenericLeadImageUrlExtractor = {
  /**
   * Finds the most likely lead image URL, trying three strategies in
   * order and returning the first candidate that survives `clean`:
   *
   *   1. lead-image meta tags (LEAD_IMAGE_URL_META_TAGS),
   *   2. heuristic scoring of every <img> found within `content`,
   *   3. highly probable nodes matched by LEAD_IMAGE_URL_SELECTORS.
   *
   * @param {Function} $ - Document query function; the tests pass the
   *   result of `cheerio.load`.
   * @param {*} content - Context handed to `$('img', content)` to scope
   *   the image search.
   * @param {*} cachedMeta - Forwarded unchanged to `extractFromMeta`.
   * @returns {?string} The cleaned image URL, or null when nothing usable
   *   was found.
   */
  extract($, content, cachedMeta) {
    // Check to see if we have a matching meta tag that we can make use of.
    // This runs first because common practice is now to use large images
    // on things like Open Graph or Twitter cards.
    const metaUrl = extractFromMeta(
      $,
      LEAD_IMAGE_URL_META_TAGS,
      cachedMeta,
      false
    )

    if (metaUrl) {
      const cleanMetaUrl = clean(metaUrl)
      if (cleanMetaUrl) return cleanMetaUrl
    }

    // Next, try to find the "best" image via the content.
    // We'd rather not have to fetch each image and check dimensions,
    // so try to do some analysis and determine them instead.
    const imgs = $('img', content).toArray()
    const imgScores = {}

    imgs.forEach((img, index) => {
      const $img = $(img)
      const src = $img.attr('src')

      // Images without a src cannot be returned as a URL; skip them.
      if (!src) return

      // A repeated src keeps the score of its last occurrence.
      imgScores[src] =
        scoreImageUrl(src) +
        scoreAttr($img) +
        scoreByParents($img) +
        scoreBySibling($img) +
        scoreByDimensions($img) +
        scoreByPosition(imgs, index)
    })

    // Pick the highest strictly-positive score; [null, 0] means "none".
    const [topUrl, topScore] =
      Reflect.ownKeys(imgScores).reduce((best, src) =>
        (imgScores[src] > best[1] ? [src, imgScores[src]] : best)
      , [null, 0])

    if (topScore > 0) {
      const cleanTopUrl = clean(topUrl)
      if (cleanTopUrl) return cleanTopUrl
    }

    // If nothing else worked, check to see if there are any really
    // probable nodes in the doc, like <link rel="image_src" />.
    // Every matching node is inspected (not just the first), so a later
    // node with a valid URL is still found when an earlier one is unusable.
    const urlAttrs = ['src', 'href', 'value']

    for (const selector of LEAD_IMAGE_URL_SELECTORS) {
      for (const node of $(selector).toArray()) {
        const $node = $(node)

        for (const attrName of urlAttrs) {
          const candidate = $node.attr(attrName)

          if (candidate) {
            const cleanCandidate = clean(candidate)
            if (cleanCandidate) return cleanCandidate
          }
        }
      }
    }

    // Explicit "not found" result, matching what `clean` returns on failure.
    return null
  },
}
|
||||
|
||||
export default GenericLeadImageUrlExtractor
|
||||
|
||||
// def extract(self):
|
||||
// """
|
||||
// # First, try to find the "best" image via the content.
|
||||
// # We'd rather not have to fetch each image and check dimensions,
|
||||
// # so try to do some analysis and determine them instead.
|
||||
// content = self.extractor.extract_content(return_type="node")
|
||||
// imgs = content.xpath('.//img')
|
||||
// img_scores = defaultdict(int)
|
||||
// logger.debug('Scoring %d images from content', len(imgs))
|
||||
// for (i, img) in enumerate(imgs):
|
||||
// img_score = 0
|
||||
//
|
||||
// if not 'src' in img.attrib:
|
||||
// logger.debug('No src attribute found')
|
||||
// continue
|
||||
//
|
||||
// try:
|
||||
// parsed_img = urlparse(img.attrib['src'])
|
||||
// img_path = parsed_img.path.lower()
|
||||
// except ValueError:
|
||||
// logger.debug('ValueError getting img path.')
|
||||
// continue
|
||||
// logger.debug('Image path is %s', img_path)
|
||||
//
|
||||
// if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
|
||||
// logger.debug('Positive URL hints match. Adding 20.')
|
||||
// img_score += 20
|
||||
//
|
||||
// if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
|
||||
// logger.debug('Negative URL hints match. Subtracting 20.')
|
||||
// img_score -= 20
|
||||
//
|
||||
// # Gifs are more often structure than photos
|
||||
// if img_path.endswith('gif'):
|
||||
// logger.debug('gif found. Subtracting 10.')
|
||||
// img_score -= 10
|
||||
//
|
||||
// # JPGs are more often photographs
|
||||
// if img_path.endswith('jpg'):
|
||||
// logger.debug('jpg found. Adding 10.')
|
||||
// img_score += 10
|
||||
//
|
||||
// # PNGs are neutral.
|
||||
//
|
||||
// # Alt attribute usually means non-presentational image.
|
||||
// if 'alt' in img.attrib and len(img.attrib['alt']) > 5:
|
||||
// logger.debug('alt attribute found. Adding 5.')
|
||||
// img_score += 5
|
||||
//
|
||||
// # Look through our parent and grandparent for figure-like
|
||||
// # container elements, give a bonus if we find them
|
||||
// parents = [img.getparent()]
|
||||
// if parents[0] is not None and parents[0].getparent() is not None:
|
||||
// parents.append(parents[0].getparent())
|
||||
// for p in parents:
|
||||
// if p.tag == 'figure':
|
||||
// logger.debug('Parent with <figure> tag found. Adding 25.')
|
||||
// img_score += 25
|
||||
//
|
||||
// p_sig = ' '.join([p.get('id', ''), p.get('class', '')])
|
||||
// if constants.PHOTO_HINTS_RE.search(p_sig):
|
||||
// logger.debug('Photo hints regex match. Adding 15.')
|
||||
// img_score += 15
|
||||
//
|
||||
// # Look at our immediate sibling and see if it looks like it's a
|
||||
// # caption. Bonus if so.
|
||||
// sibling = img.getnext()
|
||||
// if sibling is not None:
|
||||
// if sibling.tag == 'figcaption':
|
||||
// img_score += 25
|
||||
//
|
||||
// sib_sig = ' '.join([sibling.get('id', ''),
|
||||
// sibling.get('class', '')]).lower()
|
||||
// if 'caption' in sib_sig:
|
||||
// img_score += 15
|
||||
//
|
||||
// # Pull out width/height if they were set.
|
||||
// img_width = None
|
||||
// img_height = None
|
||||
// if 'width' in img.attrib:
|
||||
// try:
|
||||
// img_width = float(img.get('width'))
|
||||
// except ValueError:
|
||||
// pass
|
||||
// if 'height' in img.attrib:
|
||||
// try:
|
||||
// img_height = float(img.get('height'))
|
||||
// except ValueError:
|
||||
// pass
|
||||
//
|
||||
// # Penalty for skinny images
|
||||
// if img_width and img_width <= 50:
|
||||
// logger.debug('Skinny image found. Subtracting 50.')
|
||||
// img_score -= 50
|
||||
//
|
||||
// # Penalty for short images
|
||||
// if img_height and img_height <= 50:
|
||||
// # Wide, short images are more common than narrow, tall ones
|
||||
// logger.debug('Short image found. Subtracting 25.')
|
||||
// img_score -= 25
|
||||
//
|
||||
// if img_width and img_height and not 'sprite' in img_path:
|
||||
// area = img_width * img_height
|
||||
//
|
||||
// if area < 5000: # Smaller than 50x100
|
||||
// logger.debug('Image with small area found. Subtracting 100.')
|
||||
// img_score -= 100
|
||||
// else:
|
||||
// img_score += round(area/1000.0)
|
||||
//
|
||||
// # If the image is higher on the page than other images,
|
||||
// # it gets a bonus. Penalty if lower.
|
||||
// logger.debug('Adding page placement bonus of %d.', len(imgs)/2 - i)
|
||||
// img_score += len(imgs)/2 - i
|
||||
//
|
||||
// # Use the raw src here because we munged img_path for case
|
||||
// # insensitivity
|
||||
// logger.debug('Final score is %d.', img_score)
|
||||
// img_scores[img.attrib['src']] += img_score
|
||||
//
|
||||
// top_score = 0
|
||||
// top_url = None
|
||||
// for (url, score) in img_scores.items():
|
||||
// if score > top_score:
|
||||
// top_url = url
|
||||
// top_score = score
|
||||
//
|
||||
// if top_score > 0:
|
||||
// logger.debug('Using top score image from content. Score was %d', top_score)
|
||||
// return top_url
|
||||
//
|
||||
//
|
||||
// # If nothing else worked, check to see if there are any really
|
||||
// # probable nodes in the doc, like <link rel="image_src" />.
|
||||
// logger.debug('Trying to find lead image in probable nodes')
|
||||
// for selector in constants.LEAD_IMAGE_URL_SELECTORS:
|
||||
// nodes = self.resource.extract_by_selector(selector)
|
||||
// for node in nodes:
|
||||
// clean_value = None
|
||||
// if node.attrib.get('src'):
|
||||
// clean_value = self.clean(node.attrib['src'])
|
||||
//
|
||||
// if not clean_value and node.attrib.get('href'):
|
||||
// clean_value = self.clean(node.attrib['href'])
|
||||
//
|
||||
// if not clean_value and node.attrib.get('value'):
|
||||
// clean_value = self.clean(node.attrib['value'])
|
||||
//
|
||||
// if clean_value:
|
||||
// logger.debug('Found lead image in probable nodes.')
|
||||
// logger.debug('Node was: %s', node)
|
||||
// return clean_value
|
||||
//
|
||||
// return None
|
@ -0,0 +1,54 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import HTML from './fixtures/html'
|
||||
|
||||
import GenericLeadImageUrlExtractor from './extractor'
|
||||
|
||||
describe('GenericLeadImageUrlExtractor', () => {
  describe('extract($, content, cachedMeta)', () => {
    // Loads a fixture, runs the extractor with the document's first
    // element as content, and compares against the fixture's expected URL.
    const assertExtracts = (fixture, cachedMeta) => {
      const $ = cheerio.load(fixture.test)
      const content = $('*').first()

      const result =
        GenericLeadImageUrlExtractor.extract($, content, cachedMeta)

      assert.equal(result, fixture.result)
    }

    it('returns og:image first', () => {
      assertExtracts(HTML.og, ['og:image'])
    })

    it('returns twitter:image', () => {
      assertExtracts(HTML.twitter, ['twitter:image'])
    })

    it('finds images based on scoring', () => {
      assertExtracts(HTML.scoring, [])
    })

    it('returns image based on selectors', () => {
      assertExtracts(HTML.selectors, [])
    })
  })
})
|
@ -0,0 +1,42 @@
|
||||
// HTML fixtures for the lead image extractor tests. Each entry pairs a
// document (`test`) with the URL the extractor is expected to return
// (`result`).
const HTML = {
  // Open Graph meta tag.
  og: {
    test: `
      <html>
        <head>
          <meta property="og:image" content="http://example.com/lead.jpg">
        </head>
      </html>
    `,
    result: 'http://example.com/lead.jpg',
  },
  // Twitter card meta tag.
  twitter: {
    test: `
      <html>
        <head>
          <meta name="twitter:image" content="http://example.com/lead.jpg">
        </head>
      </html>
    `,
    result: 'http://example.com/lead.jpg',
  },
  // No meta tags: the image must be chosen by scoring.
  scoring: {
    test: `
      <div>
        <img src="http://example.com/sprite/abadpic.jpg" />
        <img src="http://example.com/upload/goodpic.jpg" />
        <img src="http://example.com/upload/whateverpic.png" />
      </div>
    `,
    result: 'http://example.com/upload/goodpic.jpg',
  },
  // No meta tags or images: only a <link rel="image_src"> node.
  selectors: {
    test: `
      <div>
        <link rel="image_src" href="http://example.com/upload/goodpic.jpg">
      </div>
    `,
    result: 'http://example.com/upload/goodpic.jpg',
  },
}
|
||||
|
||||
export default HTML
|
@ -0,0 +1,125 @@
|
||||
import {
|
||||
POSITIVE_LEAD_IMAGE_URL_HINTS_RE,
|
||||
NEGATIVE_LEAD_IMAGE_URL_HINTS_RE,
|
||||
GIF_RE,
|
||||
JPG_RE,
|
||||
} from './constants'
|
||||
|
||||
import { PHOTO_HINTS_RE } from '../content/utils/constants'
|
||||
|
||||
// Scores image urls based on a variety of heuristics.
|
||||
/**
 * Scores an image URL from its text alone.
 *
 * @param {string} url - Image URL; surrounding whitespace is ignored.
 * @returns {number} Sum of the points of every rule whose pattern matches.
 */
export function scoreImageUrl(url) {
  const candidate = url.trim()

  // [pattern, points] pairs, applied independently of each other.
  const rules = [
    [POSITIVE_LEAD_IMAGE_URL_HINTS_RE, 20],
    [NEGATIVE_LEAD_IMAGE_URL_HINTS_RE, -20],
    // TODO: We might want to consider removing this as
    // gifs are much more common/popular than they once were
    [GIF_RE, -10],
    [JPG_RE, 10],
    // PNGs are neutral.
  ]

  return rules.reduce(
    (total, [pattern, points]) =>
      (pattern.test(candidate) ? total + points : total),
    0
  )
}
|
||||
|
||||
// Alt attribute usually means non-presentational image.
|
||||
/**
 * Awards points for an image that carries a non-empty alt attribute.
 *
 * @param {Object} $img - Wrapped image node exposing `attr(name)`.
 * @returns {number} 5 when `alt` is present and non-empty, otherwise 0.
 */
export function scoreAttr($img) {
  const altText = $img.attr('alt')

  return altText ? 5 : 0
}
|
||||
|
||||
// Look through our parent and grandparent for figure-like
|
||||
// container elements, give a bonus if we find them
|
||||
/**
 * Scores an image by the elements that contain it.
 *
 * - +25 when any ancestor is a <figure>.
 * - +15 for each of the parent and grandparent whose class/id signature
 *   matches PHOTO_HINTS_RE.
 *
 * @param {Object} $img - Wrapped image node (cheerio-style API).
 * @returns {number} The accumulated bonus.
 */
export function scoreByParents($img) {
  let score = 0

  const $figParent = $img.parents('figure').first()
  if ($figParent.length === 1) {
    score = score + 25
  }

  // Only look up the grandparent when there is exactly one parent.
  // Previously a missing parent left the grandparent undefined and the
  // signature lookup below threw a TypeError on it.
  const $parent = $img.parent()
  const $ancestors = [$parent]
  if ($parent.length === 1) {
    $ancestors.push($parent.parent())
  }

  $ancestors.forEach(($node) => {
    if (PHOTO_HINTS_RE.test(getSig($node))) {
      score = score + 15
    }
  })

  return score
}
|
||||
|
||||
// Look at our immediate sibling and see if it looks like it's a
|
||||
// caption. Bonus if so.
|
||||
/**
 * Scores an image by the element that immediately follows it.
 *
 * @param {Object} $img - Wrapped image node (cheerio-style API).
 * @returns {number} 25 if the next sibling is a <figcaption>, plus 15 if
 *   that sibling's class/id signature matches PHOTO_HINTS_RE.
 */
export function scoreBySibling($img) {
  const $next = $img.next()
  const nextNode = $next.get(0)

  // `nextNode` is undefined when the image has no following sibling.
  const tagBonus = nextNode && nextNode.tagName === 'figcaption' ? 25 : 0
  const hintBonus = PHOTO_HINTS_RE.test(getSig($next)) ? 15 : 0

  return tagBonus + hintBonus
}
|
||||
|
||||
/**
 * Scores an image from its declared `width` / `height` attributes.
 *
 * - -50 when width is set and <= 50 (skinny image).
 * - -50 when height is set and <= 50 (short image).
 * - When both are set and the src does not mention "sprite": -100 for an
 *   area below 5000 square pixels, otherwise + round(area / 1000).
 *
 * Missing or non-numeric dimensions parse to NaN and are ignored.
 *
 * @param {Object} $img - Wrapped image node exposing `attr(name)`.
 * @returns {number} The accumulated score.
 */
export function scoreByDimensions($img) {
  let score = 0

  const width = parseFloat($img.attr('width'))
  const height = parseFloat($img.attr('height'))
  // A missing src is treated as an empty string so the sprite check below
  // cannot throw; lowercasing makes that check case-insensitive.
  const src = ($img.attr('src') || '').toLowerCase()

  // Penalty for skinny images
  if (width && width <= 50) {
    score = score - 50
  }

  // Penalty for short images
  // NOTE(review): the Python implementation this was ported from subtracts
  // 25 here; the accompanying tests expect 50, so 50 is kept -- confirm.
  if (height && height <= 50) {
    score = score - 50
  }

  // Sprites declare the size of the whole sheet, so their area is ignored.
  if (width && height && !src.includes('sprite')) {
    const area = width * height

    // 5000 is the area of a 50 x 100 image.
    score = score + (area < 5000 ? -100 : Math.round(area / 1000))
  }

  return score
}
|
||||
|
||||
/**
 * Scores an image by where it sits among the candidate images: positive
 * in the first half of the list, negative in the second half.
 *
 * @param {{length: number}} $imgs - The full collection of candidates.
 * @param {number} index - Zero-based position of the image being scored.
 * @returns {number} Half the collection length minus `index`.
 */
export function scoreByPosition($imgs, index) {
  const midpoint = $imgs.length / 2

  return midpoint - index
}
|
||||
|
||||
/**
 * Builds a node's "signature": its class attribute and id attribute
 * separated by one space. A missing attribute contributes an empty string.
 *
 * @param {Object} $node - Wrapped node exposing `attr(name)`.
 * @returns {string} "<class> <id>".
 */
function getSig($node) {
  const className = $node.attr('class') || ''
  const id = $node.attr('id') || ''

  return [className, id].join(' ')
}
|
@ -0,0 +1,222 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import {
|
||||
scoreImageUrl,
|
||||
scoreAttr,
|
||||
scoreByParents,
|
||||
scoreBySibling,
|
||||
scoreByDimensions,
|
||||
scoreByPosition,
|
||||
} from './score-image'
|
||||
|
||||
// Suite title fixed: it previously read 'scoreImageUrlUrl(url)', which does
// not match the function under test.
describe('scoreImageUrl(url)', () => {
  it('gets 20 points for a positive lead img hint', () => {
    const url = 'http://example.com/upload/img.png'

    assert.equal(scoreImageUrl(url), 20)
  })

  it('loses 20 points for a negative lead img hint', () => {
    const url = 'http://example.com/sprite/foo/bar.png'

    assert.equal(scoreImageUrl(url), -20)
  })

  it('loses 10 points for a gif', () => {
    const url = 'http://example.com/foo/bar.gif'

    assert.equal(scoreImageUrl(url), -10)

    // "gif" only counts at the very end of the URL.
    const url2 = 'http://example.com/foogif/bar'

    assert.equal(scoreImageUrl(url2), 0)
  })

  it('gains 10 points for a jpg', () => {
    const url = 'http://example.com/foo/bar.jpg'
    assert.equal(scoreImageUrl(url), 10)

    const url2 = 'http://example.com/foo/bar.jpeg'
    assert.equal(scoreImageUrl(url2), 10)

    // "jpg" only counts at the very end of the URL.
    const url3 = 'http://example.com/foojpg/bar'
    assert.equal(scoreImageUrl(url3), 0)
  })
})
|
||||
|
||||
describe('scoreAttr($img)', () => {
  it('gets 5 points if the img node has an alt attribute', () => {
    const $ = cheerio.load('<div><img alt="Wow" /></div>')
    const $img = $('img').first()

    assert.equal(scoreAttr($img), 5)
  })

  // Description fixed: this case covers an image WITHOUT an alt attribute
  // (the old text repeated "has an alt attribute").
  it('gets 0 points if the img node has no alt attribute', () => {
    const $ = cheerio.load('<div><img /></div>')
    const $img = $('img').first()

    assert.equal(scoreAttr($img), 0)
  })
})
|
||||
|
||||
describe('scoreByParents($img)', () => {
  // Parses the markup and scores its first <img>.
  const parentScore = (html) => {
    const $ = cheerio.load(html)

    return scoreByParents($('img').first())
  }

  it('gets 25 points if it has a figure parent', () => {
    const html =
      `<div>
        <figure>
          <div>
            <img alt="Wow" />
          </div>
        </figure>
      </div>`

    assert.equal(parentScore(html), 25)
  })

  it('gets 0 points if the img has no figure parent', () => {
    assert.equal(parentScore('<div><img /></div>'), 0)
  })

  it('gets 15 points if parent or gparent has photo hints', () => {
    const html =
      `<div>
        <div class="figure">
          <div>
            <img alt="Wow" />
          </div>
        </div>
      </div>`

    assert.equal(parentScore(html), 15)
  })
})
|
||||
|
||||
describe('scoreBySibling($img)', () => {
  // Parses the markup and scores its first <img>.
  const siblingScore = (html) => {
    const $ = cheerio.load(html)

    return scoreBySibling($('img').first())
  }

  it('gets 25 points if its sibling is figcaption', () => {
    const html = `
      <div>
        <img />
        <figcaption>Wow</figcaption>
      </div>
    `

    assert.equal(siblingScore(html), 25)
  })

  it('gets 15 points if its sibling has photo hints', () => {
    const html =
      `<div>
        <div>
          <img alt="Wow" />
          <div class="caption">
            Wow
          </div>
        </div>
      </div>`

    assert.equal(siblingScore(html), 15)
  })
})
|
||||
|
||||
describe('scoreByDimensions($img)', () => {
  // Wraps a single <img> tag in a <div>, parses it, and scores the image.
  const dimensionScore = (imgTag) => {
    const $ = cheerio.load(`
      <div>
        ${imgTag}
      </div>
    `)

    return scoreByDimensions($('img').first())
  }

  it('penalizes skinny images', () => {
    assert.equal(dimensionScore('<img width="10" />'), -50)
  })

  it('penalizes short images', () => {
    assert.equal(dimensionScore('<img height="10" />'), -50)
  })

  it('ignores sprites', () => {
    const imgTag = '<img src="/sprite/etc/foo.png" width="1000" height="1000" />'

    assert.equal(dimensionScore(imgTag), 0)
  })

  it('penalizes images with small areas', () => {
    const imgTag = '<img src="/etc/foo.png" width="60" height="60" />'

    assert.equal(dimensionScore(imgTag), -100)
  })

  it('prefers the largest images', () => {
    const imgTag = '<img src="/etc/foo.png" width="1000" height="1000" />'

    assert.equal(dimensionScore(imgTag), 1000)
  })
})
|
||||
|
||||
describe('scoreByPosition($imgs, index)', () => {
  it('gives higher scores to images that come first', () => {
    // Six images: the first one should score 6 / 2 - 0 = 3.
    const html = `
      <div>
        <img width="10" />
        <img width="10" />
        <img width="10" />
        <img width="10" />
        <img width="10" />
        <img width="10" />
      </div>
    `
    const $ = cheerio.load(html)
    const $imgs = $('img')
    const firstImageScore = scoreByPosition($imgs, 0)

    assert.equal(firstImageScore, 3)
  })
})
|
||||
|
Loading…
Reference in New Issue