chore: plumbing

8 years ago · 7e28871a02
parent 746d07d4a2
commit 7e28871a02
5 changed files with 49 additions and 14 deletions
--- a/src/extractor/generic/index.js
+++ b/src/extractor/generic/index.js
@ -4,15 +4,24 @@ import GenericContentExtractor from './content/extractor'
 import GenericTitleExtractor from './title/extractor'
 const GenericExtractor = {
-  parse: (html) => {
+  // Extract generic article fields from a page.
+  //   url  - address of the page; forwarded to the title extractor.
+  //   html - markup of the page, loaded with cheerio.
+  // Returns an object with `content` and `title` keys.
+  // Throws an Error when html is missing (fetching by url is still a TODO).
+  parse: (url, html) => {
-    let $ = cheerio.load(html)
+    // Fail loudly until the fetch path exists; otherwise $ would never
+    // be initialized and the first $() call would throw an unrelated error.
+    if (!html) {
+      // TODO
+      // Fetch link, following redirects
+      // to return html and initialize $
+      throw new Error('GenericExtractor.parse: html is required until fetching by url is implemented')
+    }
+    // Bound at function scope so the extractors below can use it; a
+    // declaration inside an if-block would not be visible here.
+    const $ = cheerio.load(html)
    // Cached value of every meta name in our document.
    // Used when extracting title/author/date_published/dek
-    const metaCache = $('meta').map((index, node) => $(node).attr('name'))
+    const metaCache = $('meta').map((index, node) => {
+      return $(node).attr('name')
+    }).toArray()
-    const title = GenericTitleExtractor.extract($, metaCache)
+    const title = GenericTitleExtractor.extract($, url, metaCache)
    return {
-      content: GenericContentExtractor.parse(html),
+      content: GenericContentExtractor.parse($, html),
      title: title,
    }
  }
--- a/src/extractor/generic/index.test.js
+++ b/src/extractor/generic/index.test.js
@ -0,0 +1,26 @@
 import assert from 'assert'
 import cheerio from 'cheerio'
 import fs from 'fs'
 import { clean } from './content/utils/dom/test-helpers'
 import GenericExtractor from './index'
 describe('GenericExtractor', () => {
  // Checks that parse(url, html) returns the expected article title for
  // saved fixture pages.
  // NOTE(review): the fixture paths below are relative, so fs resolves
  // them against process.cwd() rather than this file's directory —
  // confirm the directory the test runner is started from.
  describe('parse(url, html)', () => {
    it("extracts the article title from the latimes fixture", () => {
      const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
      const { title } = GenericExtractor.parse("http://latimes.com", html)
      assert.equal(title, 'California appears poised to be first to ban power-guzzling big-screen TVs')
    })
    it("extracts the article title from the wired fixture", () => {
      const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')
      const { title } = GenericExtractor.parse("http://wired.com", html)
      assert.equal(title, 'Airplane Tires Don’t Explode on Landing Because They Are Pumped!')
    })
  })
 })
--- a/src/extractor/generic/title/extractor.js
+++ b/src/extractor/generic/title/extractor.js
@ -11,26 +11,26 @@ import {
 } from '../utils'
 const GenericTitleExtractor = {
-  extract($, cachedMeta) {
+  extract($, url, cachedMeta) {
    // First, check to see if we have a matching meta tag that we can make
    // use of that is strongly associated with the headline.
    let title
    title = extractFromMeta($, STRONG_TITLE_META_TAGS, cachedMeta)
-    if (title) return cleanTitle(title)
+    if (title) return cleanTitle(title, url, $)
    // Second, look through our content selectors for the most likely
    // article title that is strongly associated with the headline.
    title = extractFromSelectors($, STRONG_TITLE_SELECTORS)
-    if (title) return cleanTitle(title)
+    if (title) return cleanTitle(title, url, $)
    // Third, check for weaker meta tags that may match.
    title = extractFromMeta($, WEAK_TITLE_META_TAGS, cachedMeta)
-    if (title) return cleanTitle(title)
+    if (title) return cleanTitle(title, url, $)
    // Last, look for weaker selector tags that may match.
    title = extractFromSelectors($, WEAK_TITLE_SELECTORS)
-    if (title) return cleanTitle(title)
+    if (title) return cleanTitle(title, url, $)
    // If no matches, return an empty string
    return ""
--- a/src/extractor/generic/title/utils/clean-title.js
+++ b/src/extractor/generic/title/utils/clean-title.js
@ -2,11 +2,11 @@ import { TITLE_SPLITTERS_RE } from '../constants'
 import { resolveSplitTitle } from './index'
 import { stripTags } from '../../../utils'
-export default function cleanTitle(title, $) {
+export default function cleanTitle(title, url, $) {
  // If title has |, :, or - in it, see if
  // we can clean it up.
  if (TITLE_SPLITTERS_RE.test(title)) {
-    title = resolveSplitTitle(title)
+    title = resolveSplitTitle(title, url)
  }
  // Final sanity check that we didn't get a crazy title.
--- a/src/extractor/generic/title/utils/resolve-split-title.js
+++ b/src/extractor/generic/title/utils/resolve-split-title.js
@ -9,7 +9,7 @@ import {
 // Given a title with separators in it (colons, dashes, etc),
 // resolve whether any of the segments should be removed.
-export default function resolveSplitTitle(title, url='http://example.com') {
+export default function resolveSplitTitle(title, url='') {
  // Splits while preserving splitters, like:
  // ['The New New York', ' - ', 'The Washington Post']
  title = title
@ -94,7 +94,7 @@ function cleanDomainFromTitle(splitTitle, url) {
  const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '')
  const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain)
-  if (endSlugRatio > .4 && endSlug.length > 5) {
+  if (endSlugRatio > .4 && endSlug.length >= 5) {
    return splitTitle.slice(0, -2).join('')
  }
 }