feat: GenericExtractLeadImageUrl

Squashed commit of the following:

commit 22d37ebf26dbbd0a3daebbfde3509a6ce04aaf72
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 1 17:50:13 2016 -0400

    feat: GenericExtractLeadImageUrl

commit 3327a0a7929dd0e9267dc9c26f4e2aa78c32586f
Author: Adam Pash <adam.pash@gmail.com>
Date:   Thu Sep 1 15:33:42 2016 -0400

    feat: can pass custom attributes to extractFromMeta
pull/1/head
Adam Pash 8 years ago
parent 467b600721
commit 0ff3082295

@ -1,9 +1,8 @@
TODO:
Tomorrow:
- extractDek
- extractNextPageUrl
- extractLeadImageUrl
- Try Closure webpack compiler
- Rename all cleaners from cleanThing to clean
- Make sure weightNodes flag is being passed properly
- Get better sense of when cheerio returns a raw node and when a cheerio object
- Remove $ from function calls to getScore
@ -13,6 +12,8 @@ TODO:
DONE:
x extractLeadImageUrl
x extractDek
x extractDatePublished
x Title metadata
x Test re-initializing $ if/when it needs to loop again

@ -21,6 +21,7 @@
"cheerio": "^0.20.0",
"moment": "^2.14.1",
"rollup": "^0.34.10",
"valid-url": "^1.0.9",
"wuzzy": "^0.1.2"
}
}

@ -1,26 +1,3 @@
// An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
export const LEAD_IMAGE_URL_META_TAGS = [
//'og:image',
'image_src',
]
// An ordered list of XPath Selectors to find likely article deks. From
// most explicit to least explicit.
//
// Should be more restrictive than not, as a failed dek can be pretty
// detrimental to the aesthetics of an article.
export const LEAD_IMAGE_URL_SELECTORS = [
{
//selector: '//link[@rel="image_src"]',
}, // hentry microformat
]
//// CONTENT FETCHING CONSTANTS ////
// A list of strings that can be considered unlikely candidates when
@ -188,43 +165,6 @@ export const PHOTO_HINTS = [
]
export const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i')
export const POSITIVE_LEAD_IMAGE_URL_HINTS = [
'upload',
'wp-content',
'large',
'photo',
'wp-image',
]
export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
'spacer',
'sprite',
'blank',
'throbber',
'gradient',
'tile',
'bg',
'background',
'icon',
'social',
'header',
'hdr',
'advert',
'spinner',
'loader',
'loading',
'default',
'rating',
'share',
'facebook',
'twitter',
'theme',
'promo',
'ads',
'wp-includes',
]
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
// A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.

@ -1,4 +1,8 @@
import moment from 'moment'
// Is there a compelling reason to use moment here?
// Mostly only being used for the isValid() method,
// but could just check for 'Invalid Date' string.
import {
CLEAN_DATE_STRING_RE,
TIME_MERIDIAN_RE

@ -5,6 +5,7 @@ import GenericTitleExtractor from './title/extractor'
import GenericAuthorExtractor from './author/extractor'
import GenericDatePublishedExtractor from './date-published/extractor'
import GenericDekExtractor from './dek/extractor'
import GenericLeadImageUrlExtractor from './lead-image-url/extractor'
const GenericExtractor = {
parse: (url, html) => {
@ -28,6 +29,8 @@ const GenericExtractor = {
GenericDatePublishedExtractor.extract($, url, metaCache)
const author = GenericAuthorExtractor.extract($, metaCache)
const content = GenericContentExtractor.parse($, html)
const leadImageUrl =
GenericLeadImageUrlExtractor.extract($, content, metaCache)
const dek = GenericDekExtractor.extract($, metaCache, content)
return {
@ -35,6 +38,7 @@ const GenericExtractor = {
author,
datePublished,
dek,
leadImageUrl,
content,
}
}

@ -15,6 +15,7 @@ describe('GenericExtractor', () => {
author,
datePublished,
dek,
leadImageUrl,
} = GenericExtractor.parse("http://latimes.com", html)
assert.equal(author, null)
@ -27,6 +28,7 @@ describe('GenericExtractor', () => {
'2009-10-14T04:00:00.000Z'
)
assert.equal(dek, null)
assert.equal(leadImageUrl, 'http://latimesblogs.latimes.com/fb.jpg')
})
it("parses html and returns the article title", () => {
@ -37,6 +39,7 @@ describe('GenericExtractor', () => {
title,
datePublished,
dek,
leadImageUrl,
} = GenericExtractor.parse("http://wired.com", html)
assert.equal(author, 'Eric Adams')
@ -46,6 +49,7 @@ describe('GenericExtractor', () => {
)
assert.equal(datePublished, null)
assert.equal(dek, null)
assert.equal(leadImageUrl, 'https://www.wired.com/wp-content/uploads/2016/08/GettyImages-536814811-1200x630-e1471497753973.jpg')
})
})

@ -0,0 +1,10 @@
import validUrl from 'valid-url'
// Validates and normalizes a lead image URL candidate.
//
// leadImageUrl: a string (possibly padded with whitespace) that may
//               or may not be a valid web URI.
// Returns the trimmed URL when valid-url accepts it as a web URI,
// otherwise null. Null/undefined/non-string input also yields null
// (previously this threw on `.trim()`).
export default function clean(leadImageUrl) {
  // Guard against absent or non-string candidates so callers don't
  // have to pre-check before delegating here.
  if (!leadImageUrl || typeof leadImageUrl !== 'string') {
    return null
  }

  leadImageUrl = leadImageUrl.trim()
  if (validUrl.isWebUri(leadImageUrl)) {
    return leadImageUrl
  } else {
    return null
  }
}

@ -0,0 +1,20 @@
import assert from 'assert'
import clean from './clean'
// Unit tests for clean(): validation and whitespace normalization of
// lead-image URL candidates.
describe('clean(leadImageUrl)', () => {
it('returns the url if valid', () => {
const url = 'https://example.com'
assert.equal(clean(url), url)
})
it('returns null if the url is not valid', () => {
const url = 'this is not a valid url'
assert.equal(clean(url), null)
})
// Leading/trailing whitespace is stripped before validation.
it('trims whitespace', () => {
const url = ' https://example.com/foo/bar.jpg'
assert.equal(clean(url), url.trim())
})
})

@ -0,0 +1,53 @@
// An ordered list of meta tag names that denote likely article leading images.
// All attributes should be lowercase for faster case-insensitive matching.
// From most distinct to least distinct.
//
// Entries are either a bare meta name (matched via the `name`
// attribute, read from `value`) or a three-element array of
// [name, attribute-to-match, attribute-to-read] — see extractFromMeta.
export const LEAD_IMAGE_URL_META_TAGS = [
  ['og:image', 'property', 'content'],
  ['twitter:image', 'name', 'content'],
  'image_src',
]

// High-probability fallback selectors checked when neither meta tags
// nor content scoring produce a usable image URL.
export const LEAD_IMAGE_URL_SELECTORS = [
  'link[rel=image_src]',
]

// URL substrings that suggest an image is a genuine photo/upload.
export const POSITIVE_LEAD_IMAGE_URL_HINTS = [
  'upload',
  'wp-content',
  'large',
  'photo',
  'wp-image',
]
export const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')

// URL substrings that suggest an image is decorative/structural
// (spacers, sprites, social buttons, ads, theme assets, etc.).
export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
  'spacer',
  'sprite',
  'blank',
  'throbber',
  'gradient',
  'tile',
  'bg',
  'background',
  'icon',
  'social',
  'header',
  'hdr',
  'advert',
  'spinner',
  'loader',
  'loading',
  'default',
  'rating',
  'share',
  'facebook',
  'twitter',
  'theme',
  'promo',
  'ads',
  'wp-includes',
]
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')

// File-extension matchers. Note the escaped dot: the previous
// /\gif$/ and /\jpe?g$/ forms match `\g`/`\j` as plain letters, so
// any URL merely ending in "gif"/"jpg" (e.g. "/mygif") matched too.
export const GIF_RE = /\.gif$/i
export const JPG_RE = /\.jpe?g$/i

@ -0,0 +1,268 @@
import 'babel-polyfill'
import {
LEAD_IMAGE_URL_META_TAGS,
LEAD_IMAGE_URL_SELECTORS,
} from './constants'
import {
extractFromMeta,
extractFromSelectors
} from '../utils'
import {
scoreImageUrl,
scoreAttr,
scoreByParents,
scoreBySibling,
scoreByDimensions,
scoreByPosition,
} from './score-image'
import clean from './clean'
// Given a resource, try to find the lead image URL from within it.
// Like content and next-page extraction, a scoring system determines
// the most likely image. Strategies are tried in priority order and
// short-circuit on the first clean result:
//   1. Meta tags (og:image, twitter:image, image_src) — checked first
//      because common practice is now to attach large, curated images
//      via Open Graph / Twitter cards.
//   2. Heuristic scoring of <img> tags inside the extracted content.
//   3. High-probability selectors such as <link rel="image_src" />.
//
// Potential signals to still take advantage of:
// * domain
// * weird aspect ratio
const GenericLeadImageUrlExtractor = {
  extract($, content, cachedMeta) {
    // Strategy 1: a matching meta tag we can make use of.
    const metaUrl = extractFromMeta(
      $,
      LEAD_IMAGE_URL_META_TAGS,
      cachedMeta,
      false
    )
    if (metaUrl) {
      const cleanedMetaUrl = clean(metaUrl)
      if (cleanedMetaUrl) return cleanedMetaUrl
    }

    // Strategy 2: find the "best" image within the content. We'd
    // rather not fetch each image and check dimensions, so analyze
    // the markup and estimate instead.
    const imgs = $('img', content).toArray()
    const imgScores = {}

    imgs.forEach((img, index) => {
      const $img = $(img)
      const src = $img.attr('src')
      if (!src) return

      imgScores[src] =
        scoreImageUrl(src) +
        scoreAttr($img) +
        scoreByParents($img) +
        scoreBySibling($img) +
        scoreByDimensions($img) +
        scoreByPosition(imgs, index)
    })

    // Pick the highest-scoring src; only strictly positive scores
    // are accepted.
    let topUrl = null
    let topScore = 0
    for (const src of Reflect.ownKeys(imgScores)) {
      if (imgScores[src] > topScore) {
        topUrl = src
        topScore = imgScores[src]
      }
    }

    if (topScore > 0) {
      const cleanedTopUrl = clean(topUrl)
      if (cleanedTopUrl) return cleanedTopUrl
    }

    // Strategy 3: really probable nodes in the doc, like
    // <link rel="image_src" />. Each candidate attribute is checked
    // in turn so a dirty src can still fall through to href/value.
    for (const selector of LEAD_IMAGE_URL_SELECTORS) {
      const $node = $(selector).first()
      for (const attr of ['src', 'href', 'value']) {
        const candidate = $node.attr(attr)
        if (candidate) {
          const cleanedCandidate = clean(candidate)
          if (cleanedCandidate) return cleanedCandidate
        }
      }
    }
  },
}

export default GenericLeadImageUrlExtractor
// def extract(self):
// """
// # First, try to find the "best" image via the content.
// # We'd rather not have to fetch each image and check dimensions,
// # so try to do some analysis and determine them instead.
// content = self.extractor.extract_content(return_type="node")
// imgs = content.xpath('.//img')
// img_scores = defaultdict(int)
// logger.debug('Scoring %d images from content', len(imgs))
// for (i, img) in enumerate(imgs):
// img_score = 0
//
// if not 'src' in img.attrib:
// logger.debug('No src attribute found')
// continue
//
// try:
// parsed_img = urlparse(img.attrib['src'])
// img_path = parsed_img.path.lower()
// except ValueError:
// logger.debug('ValueError getting img path.')
// continue
// logger.debug('Image path is %s', img_path)
//
// if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
// logger.debug('Positive URL hints match. Adding 20.')
// img_score += 20
//
// if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
// logger.debug('Negative URL hints match. Subtracting 20.')
// img_score -= 20
//
// # Gifs are more often structure than photos
// if img_path.endswith('gif'):
// logger.debug('gif found. Subtracting 10.')
// img_score -= 10
//
// # JPGs are more often photographs
// if img_path.endswith('jpg'):
// logger.debug('jpg found. Adding 10.')
// img_score += 10
//
// # PNGs are neutral.
//
// # Alt attribute usually means non-presentational image.
// if 'alt' in img.attrib and len(img.attrib['alt']) > 5:
// logger.debug('alt attribute found. Adding 5.')
// img_score += 5
//
// # Look through our parent and grandparent for figure-like
// # container elements, give a bonus if we find them
// parents = [img.getparent()]
// if parents[0] is not None and parents[0].getparent() is not None:
// parents.append(parents[0].getparent())
// for p in parents:
// if p.tag == 'figure':
// logger.debug('Parent with <figure> tag found. Adding 25.')
// img_score += 25
//
// p_sig = ' '.join([p.get('id', ''), p.get('class', '')])
// if constants.PHOTO_HINTS_RE.search(p_sig):
// logger.debug('Photo hints regex match. Adding 15.')
// img_score += 15
//
// # Look at our immediate sibling and see if it looks like it's a
// # caption. Bonus if so.
// sibling = img.getnext()
// if sibling is not None:
// if sibling.tag == 'figcaption':
// img_score += 25
//
// sib_sig = ' '.join([sibling.get('id', ''),
// sibling.get('class', '')]).lower()
// if 'caption' in sib_sig:
// img_score += 15
//
// # Pull out width/height if they were set.
// img_width = None
// img_height = None
// if 'width' in img.attrib:
// try:
// img_width = float(img.get('width'))
// except ValueError:
// pass
// if 'height' in img.attrib:
// try:
// img_height = float(img.get('height'))
// except ValueError:
// pass
//
// # Penalty for skinny images
// if img_width and img_width <= 50:
// logger.debug('Skinny image found. Subtracting 50.')
// img_score -= 50
//
// # Penalty for short images
// if img_height and img_height <= 50:
// # Wide, short images are more common than narrow, tall ones
// logger.debug('Short image found. Subtracting 25.')
// img_score -= 25
//
// if img_width and img_height and not 'sprite' in img_path:
// area = img_width * img_height
//
// if area < 5000: # Smaller than 50x100
// logger.debug('Image with small area found. Subtracting 100.')
// img_score -= 100
// else:
// img_score += round(area/1000.0)
//
// # If the image is higher on the page than other images,
// # it gets a bonus. Penalty if lower.
// logger.debug('Adding page placement bonus of %d.', len(imgs)/2 - i)
// img_score += len(imgs)/2 - i
//
// # Use the raw src here because we munged img_path for case
// # insensitivity
// logger.debug('Final score is %d.', img_score)
// img_scores[img.attrib['src']] += img_score
//
// top_score = 0
// top_url = None
// for (url, score) in img_scores.items():
// if score > top_score:
// top_url = url
// top_score = score
//
// if top_score > 0:
// logger.debug('Using top score image from content. Score was %d', top_score)
// return top_url
//
//
// # If nothing else worked, check to see if there are any really
// # probable nodes in the doc, like <link rel="image_src" />.
// logger.debug('Trying to find lead image in probable nodes')
// for selector in constants.LEAD_IMAGE_URL_SELECTORS:
// nodes = self.resource.extract_by_selector(selector)
// for node in nodes:
// clean_value = None
// if node.attrib.get('src'):
// clean_value = self.clean(node.attrib['src'])
//
// if not clean_value and node.attrib.get('href'):
// clean_value = self.clean(node.attrib['href'])
//
// if not clean_value and node.attrib.get('value'):
// clean_value = self.clean(node.attrib['value'])
//
// if clean_value:
// logger.debug('Found lead image in probable nodes.')
// logger.debug('Node was: %s', node)
// return clean_value
//
// return None

@ -0,0 +1,54 @@
import assert from 'assert'
import cheerio from 'cheerio'
import HTML from './fixtures/html'
import GenericLeadImageUrlExtractor from './extractor'
// Integration tests for GenericLeadImageUrlExtractor.extract,
// exercising each extraction strategy in priority order: meta tags,
// content scoring, then fallback selectors.
describe('GenericLeadImageUrlExtractor', () => {
describe('extract($, content, cachedMeta)', () => {
// og:image is the most distinct meta tag and wins outright.
it('returns og:image first', () => {
const $ = cheerio.load(HTML.og.test)
const content = $('*').first()
const cachedMeta = ['og:image']
const result =
GenericLeadImageUrlExtractor.extract($, content, cachedMeta)
assert.equal(result, HTML.og.result)
})
it('returns twitter:image', () => {
const $ = cheerio.load(HTML.twitter.test)
const content = $('*').first()
const cachedMeta = ['twitter:image']
const result =
GenericLeadImageUrlExtractor.extract($, content, cachedMeta)
assert.equal(result, HTML.twitter.result)
})
// With no cached meta tags, falls through to scoring imgs in content.
it('finds images based on scoring', () => {
const $ = cheerio.load(HTML.scoring.test)
const content = $('*').first()
const cachedMeta = []
const result =
GenericLeadImageUrlExtractor.extract($, content, cachedMeta)
assert.equal(result, HTML.scoring.result)
})
// Last resort: probable nodes like <link rel="image_src">.
it('returns image based on selectors', () => {
const $ = cheerio.load(HTML.selectors.test)
const content = $('*').first()
const cachedMeta = []
const result =
GenericLeadImageUrlExtractor.extract($, content, cachedMeta)
assert.equal(result, HTML.selectors.result)
})
})
})

@ -0,0 +1,42 @@
// HTML fixtures for the GenericLeadImageUrlExtractor tests. Each key
// holds a `test` document and the `result` URL expected from it.
const HTML = {
// og:image meta tag — strategy 1, most distinct.
og: {
test: `
<html>
<head>
<meta property="og:image" content="http://example.com/lead.jpg">
</head>
</html>
`,
result: `http://example.com/lead.jpg`
},
// twitter:image meta tag — strategy 1, second choice.
twitter: {
test: `
<html>
<head>
<meta name="twitter:image" content="http://example.com/lead.jpg">
</head>
</html>
`,
result: `http://example.com/lead.jpg`
},
// No meta tags: the "upload" jpg should out-score the sprite and png.
scoring: {
test: `
<div>
<img src="http://example.com/sprite/abadpic.jpg" />
<img src="http://example.com/upload/goodpic.jpg" />
<img src="http://example.com/upload/whateverpic.png" />
</div>
`,
result: `http://example.com/upload/goodpic.jpg`
},
// Fallback selector: <link rel="image_src"> href.
selectors: {
test: `
<div>
<link rel="image_src" href="http://example.com/upload/goodpic.jpg">
</div>
`,
result: `http://example.com/upload/goodpic.jpg`
},
}
export default HTML

@ -0,0 +1,125 @@
import {
POSITIVE_LEAD_IMAGE_URL_HINTS_RE,
NEGATIVE_LEAD_IMAGE_URL_HINTS_RE,
GIF_RE,
JPG_RE,
} from './constants'
import { PHOTO_HINTS_RE } from '../content/utils/constants'
// Scores an image URL based on a variety of string heuristics:
// known-good path hints add points, known-bad hints and structural
// image types (gifs) subtract them. Returns the numeric score.
export function scoreImageUrl(url) {
  const imageUrl = url.trim()
  let score = 0

  if (POSITIVE_LEAD_IMAGE_URL_HINTS_RE.test(imageUrl)) {
    score += 20
  }
  if (NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.test(imageUrl)) {
    score -= 20
  }

  // Gifs are more often structure than photos.
  // TODO: We might want to consider removing this as gifs are much
  // more common/popular than they once were.
  if (GIF_RE.test(imageUrl)) {
    score -= 10
  }

  // JPGs are more often photographs. PNGs are neutral.
  if (JPG_RE.test(imageUrl)) {
    score += 10
  }

  return score
}
// An alt attribute usually means a non-presentational image; award a
// small bonus when it is present and non-empty.
export function scoreAttr($img) {
  return $img.attr('alt') ? 5 : 0
}
// Look through our parent and grandparent for figure-like
// container elements, give a bonus if we find them.
//
// +25 when any ancestor is a <figure>; +15 for each of parent and
// grandparent whose class/id matches PHOTO_HINTS_RE.
export function scoreByParents($img) {
  let score = 0

  const $figParent = $img.parents('figure').first()
  if ($figParent.length === 1) {
    score = score + 25
  }

  const $parent = $img.parent()
  let $gParent
  if ($parent.length === 1) {
    $gParent = $parent.parent()
  }

  // $gParent stays undefined when the image has no single parent;
  // filter out undefined/empty selections so getSig isn't called on
  // undefined (previously a TypeError).
  const $candidates = [$parent, $gParent].filter(
    $node => $node && $node.length > 0
  )
  $candidates.forEach(($node) => {
    if (PHOTO_HINTS_RE.test(getSig($node))) {
      score = score + 15
    }
  })

  return score
}
// Look at the image's immediate next sibling and see if it looks
// like a caption: +25 for a <figcaption> element, +15 when its
// class/id matches the photo-hint regex.
export function scoreBySibling($img) {
  const $sibling = $img.next()
  const siblingNode = $sibling.get(0)

  let score = 0
  if (siblingNode && siblingNode.tagName === 'figcaption') {
    score += 25
  }
  if (PHOTO_HINTS_RE.test(getSig($sibling))) {
    score += 15
  }

  return score
}
// Scores an image by its declared width/height attributes: penalties
// for skinny or short images, a large penalty for tiny areas, and a
// bonus proportional to area for big images. Sprites are excluded
// from area scoring.
export function scoreByDimensions($img) {
  let score = 0

  // parseFloat(undefined) is NaN, which short-circuits the checks
  // below when a dimension attribute is absent.
  const width = parseFloat($img.attr('width'))
  const height = parseFloat($img.attr('height'))
  const src = $img.attr('src')

  // Penalty for skinny images
  if (width && width <= 50) {
    score = score - 50
  }

  // Penalty for short images
  if (height && height <= 50) {
    score = score - 50
  }

  // A missing src is treated as "not a sprite" (previously
  // src.includes threw a TypeError on undefined).
  if (width && height && !(src || '').includes('sprite')) {
    const area = width * height
    if (area < 5000) { // Smaller than 50 x 100
      score = score - 100
    } else {
      score = score + Math.round(area / 1000)
    }
  }

  return score
}
// Images higher on the page get a bonus, lower ones a penalty:
// returns ($imgs.length / 2) - index, so the midpoint scores 0.
export function scoreByPosition($imgs, index) {
  const midpoint = $imgs.length / 2
  return midpoint - index
}
// Builds a "signature" string from a node's class and id attributes
// (either may be absent) for matching against hint regexes.
function getSig($node) {
  const className = $node.attr('class') || ''
  const id = $node.attr('id') || ''
  return `${className} ${id}`
}

@ -0,0 +1,222 @@
import assert from 'assert'
import cheerio from 'cheerio'
import {
scoreImageUrl,
scoreAttr,
scoreByParents,
scoreBySibling,
scoreByDimensions,
scoreByPosition,
} from './score-image'
// Tests for scoreImageUrl: URL-string scoring heuristics.
// (describe name fixed: was the typo 'scoreImageUrlUrl(url)'.)
describe('scoreImageUrl(url)', () => {
  it('gets 20 points for a positive lead img hint', () => {
    const url = 'http://example.com/upload/img.png'
    assert.equal(scoreImageUrl(url), 20)
  })

  it('loses 20 points for a negative lead img hint', () => {
    const url = 'http://example.com/sprite/foo/bar.png'
    assert.equal(scoreImageUrl(url), -20)
  })

  it('loses 10 points for a gif', () => {
    const url = 'http://example.com/foo/bar.gif'
    assert.equal(scoreImageUrl(url), -10)

    // "gif" embedded mid-path must not trigger the extension penalty.
    const url2 = 'http://example.com/foogif/bar'
    assert.equal(scoreImageUrl(url2), 0)
  })

  it('gains 10 points for a jpg', () => {
    const url = 'http://example.com/foo/bar.jpg'
    assert.equal(scoreImageUrl(url), 10)

    const url2 = 'http://example.com/foo/bar.jpeg'
    assert.equal(scoreImageUrl(url2), 10)

    // "jpg" embedded mid-path must not trigger the extension bonus.
    const url3 = 'http://example.com/foojpg/bar'
    assert.equal(scoreImageUrl(url3), 0)
  })
})
// Tests for scoreAttr: bonus for a non-presentational alt attribute.
describe('scoreAttr($img)', () => {
  it('gets 5 points if the img node has an alt attribute', () => {
    const $ = cheerio.load('<div><img alt="Wow" /></div>')
    const $img = $('img').first()
    assert.equal(scoreAttr($img), 5)
  })

  // Description fixed: it previously duplicated the "has an alt
  // attribute" wording while actually testing the absence case.
  it('gets 0 points if the img node has no alt attribute', () => {
    const $ = cheerio.load('<div><img /></div>')
    const $img = $('img').first()
    assert.equal(scoreAttr($img), 0)
  })
})
// Tests for scoreByParents: <figure> ancestors and photo-hint
// class/id names on the parent or grandparent add points.
describe('scoreByParents($img)', () => {
it('gets 25 points if it has a figure parent', () => {
const $ = cheerio.load(
`<div>
<figure>
<div>
<img alt="Wow" />
</div>
</figure>
</div>`
)
const $img = $('img').first()
assert.equal(scoreByParents($img), 25)
})
it('gets 0 points if the img has no figure parent', () => {
const $ = cheerio.load('<div><img /></div>')
const $img = $('img').first()
assert.equal(scoreByParents($img), 0)
})
// class="figure" on an ancestor matches PHOTO_HINTS_RE.
it('gets 15 points if parent or gparent has photo hints', () => {
const $ = cheerio.load(
`<div>
<div class="figure">
<div>
<img alt="Wow" />
</div>
</div>
</div>`
)
const $img = $('img').first()
assert.equal(scoreByParents($img), 15)
})
})
// Tests for scoreBySibling: caption-like markup immediately after
// the image (a <figcaption> element, or photo-hint class names).
describe('scoreBySibling($img)', () => {
it('gets 25 points if its sibling is figcaption', () => {
const $ = cheerio.load(
`
<div>
<img />
<figcaption>Wow</figcaption>
</div>
`
)
const $img = $('img').first()
assert.equal(scoreBySibling($img), 25)
})
// class="caption" on the next sibling matches PHOTO_HINTS_RE.
it('gets 15 points if its sibling has photo hints', () => {
const $ = cheerio.load(
`<div>
<div>
<img alt="Wow" />
<div class="caption">
Wow
</div>
</div>
</div>`
)
const $img = $('img').first()
assert.equal(scoreBySibling($img), 15)
})
})
// Tests for scoreByDimensions: width/height attribute heuristics —
// penalties for skinny, short, or tiny-area images; sprites are
// excluded from area scoring; large areas earn a proportional bonus.
describe('scoreByDimensions($img)', () => {
it('penalizes skinny images', () => {
const $ = cheerio.load(
`
<div>
<img width="10" />
</div>
`
)
const $img = $('img').first()
assert.equal(scoreByDimensions($img), -50)
})
it('penalizes short images', () => {
const $ = cheerio.load(
`
<div>
<img height="10" />
</div>
`
)
const $img = $('img').first()
assert.equal(scoreByDimensions($img), -50)
})
// "sprite" in the src skips the area bonus entirely.
it('ignores sprites', () => {
const $ = cheerio.load(
`
<div>
<img src="/sprite/etc/foo.png" width="1000" height="1000" />
</div>
`
)
const $img = $('img').first()
assert.equal(scoreByDimensions($img), 0)
})
// 60 x 60 = 3600 < 5000, so the small-area penalty applies.
it('penalizes images with small areas', () => {
const $ = cheerio.load(
`
<div>
<img src="/etc/foo.png" width="60" height="60" />
</div>
`
)
const $img = $('img').first()
assert.equal(scoreByDimensions($img), -100)
})
// 1000 x 1000 = 1,000,000; bonus is round(area / 1000) = 1000.
it('prefers the largest images', () => {
const $ = cheerio.load(
`
<div>
<img src="/etc/foo.png" width="1000" height="1000" />
</div>
`
)
const $img = $('img').first()
assert.equal(scoreByDimensions($img), 1000)
})
})
// Tests for scoreByPosition: earlier images score higher.
// With 6 images, index 0 scores 6/2 - 0 = 3.
describe('scoreByPosition($imgs, index)', () => {
it('gives higher scores to images that come first', () => {
const $ = cheerio.load(
`
<div>
<img width="10" />
<img width="10" />
<img width="10" />
<img width="10" />
<img width="10" />
<img width="10" />
</div>
`
)
const $imgs = $('img')
assert.equal(scoreByPosition($imgs, 0), 3)
})
})

@ -2,25 +2,47 @@ import { stripTags } from '../../utils'
// Given a node type to search for, and a list of meta tag names to
// search for, find a meta tag associated.
// metaNames can be an array of strings or an array of three-element
// arrays that will define the attributes to select from the meta
// elements. E.g., ['og:image', 'property', 'content'] will search
// $('meta[property=og:image]').attr('content').
//
// Default is $('meta[name=og:image]').attr(value)
export default function extractFromMeta(
$,
metaNames,
cachedNames,
cleanTags=true
cleanTags=true,
) {
const foundNames = metaNames.filter(name =>
cachedNames.indexOf(name) !== -1
)
let metaValue
for (const name of foundNames) {
const nodes = $(`meta[name="${name}"]`)
const foundNames = metaNames.filter(name => {
const metaType = typeof name
if (metaType === 'string') {
return cachedNames.indexOf(name) !== -1
} else if (metaType === 'object') {
return cachedNames.indexOf(name[0]) !== 1
}
})
for (let name of foundNames) {
let type, value
if (typeof name === 'string') {
type = 'name'
value = 'value'
} else {
type = name[1]
value = name[2]
name = name[0]
}
const nodes = $(`meta[${type}="${name}"]`)
// Get the unique value of every matching node, in case there
// are two meta tags with the same name and value.
// Remove empty values.
const values =
$(`meta[name="${name}"]`).map((index, node) => $(node).attr('value'))
nodes.map((index, node) => $(node).attr(value))
.toArray()
.filter(text => text !== '')
@ -32,6 +54,7 @@ export default function extractFromMeta(
continue
}
let metaValue
// Meta values that contain HTML should be stripped, as they
// weren't subject to cleaning previously.
if (cleanTags) {

@ -32,6 +32,16 @@ describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => {
assert.equal(result, HTML.metaEmptyDupes.result)
})
it('accepts custom attributes', () => {
const $ = cheerio.load(HTML.custom.test)
const metaNames = [['foo', 'property', 'content']]
const cachedNames = ['foo']
const result = extractFromMeta(
$, metaNames, cachedNames
)
assert.equal(result, HTML.custom.result)
})
})

@ -23,6 +23,13 @@ const HTML = {
</html>`,
result: `bar`,
},
custom: {
test: `
<html>
<meta property="foo" content="bar" />
</html>`,
result: `bar`,
},
// extractFromSelectors
simpleSelector: {

Loading…
Cancel
Save