chore: plumbing

pull/1/head
Adam Pash 8 years ago
parent 746d07d4a2
commit 7e28871a02

@ -4,15 +4,24 @@ import GenericContentExtractor from './content/extractor'
import GenericTitleExtractor from './title/extractor'
const GenericExtractor = {
parse: (html) => {
let $ = cheerio.load(html)
parse: (url, html) => {
if (html) {
let $ = cheerio.load(html)
} else {
// TODO
// Fetch link, following redirects
// to return html and initialize $
}
// Cached value of every meta name in our document.
// Used when extracting title/author/date_published/dek
const metaCache = $('meta').map((index, node) => $(node).attr('name'))
const metaCache = $('meta').map((index, node) => {
return $(node).attr('name')
}).toArray()
const title = GenericTitleExtractor.extract($, metaCache)
const title = GenericTitleExtractor.extract($, url, metaCache)
return {
content: GenericContentExtractor.parse(html),
content: GenericContentExtractor.parse($, html),
title: title,
}
}

@ -0,0 +1,26 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import { clean } from './content/utils/dom/test-helpers'
import GenericExtractor from './index'
// Fixture-based tests for GenericExtractor.parse(url, html).
// Each case reads a saved article page from disk, runs the extractor with the
// article's site URL, and checks the title it returns.
//
// NOTE(review): the fixture paths are relative, so fs.readFileSync resolves
// them against the process working directory rather than this file's
// location — confirm the test runner is started from the expected directory.
describe('GenericExtractor', () => {
  describe('parse(url, html)', () => {
    // Case names include the fixture so a failure report identifies which
    // page broke.
    it('parses html and returns the article title (latimes fixture)', () => {
      const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')

      const { title } = GenericExtractor.parse('http://latimes.com', html)

      // strictEqual avoids the type coercion of the legacy assert.equal.
      assert.strictEqual(title, 'California appears poised to be first to ban power-guzzling big-screen TVs')
    })

    it('parses html and returns the article title (wired fixture)', () => {
      const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')

      const { title } = GenericExtractor.parse('http://wired.com', html)

      assert.strictEqual(title, 'Airplane Tires Dont Explode on Landing Because They Are Pumped!')
    })
  })
})

@ -11,26 +11,26 @@ import {
} from '../utils'
const GenericTitleExtractor = {
extract($, cachedMeta) {
extract($, url, cachedMeta) {
// First, check to see if we have a matching meta tag that we can make
// use of that is strongly associated with the headline.
let title
title = extractFromMeta($, STRONG_TITLE_META_TAGS, cachedMeta)
if (title) return cleanTitle(title)
if (title) return cleanTitle(title, url, $)
// Second, look through our content selectors for the most likely
// article title that is strongly associated with the headline.
title = extractFromSelectors($, STRONG_TITLE_SELECTORS)
if (title) return cleanTitle(title)
if (title) return cleanTitle(title, url, $)
// Third, check for weaker meta tags that may match.
title = extractFromMeta($, WEAK_TITLE_META_TAGS, cachedMeta)
if (title) return cleanTitle(title)
if (title) return cleanTitle(title, url, $)
// Last, look for weaker selector tags that may match.
title = extractFromSelectors($, WEAK_TITLE_SELECTORS)
if (title) return cleanTitle(title)
if (title) return cleanTitle(title, url, $)
// If no matches, return an empty string
return ""

@ -2,11 +2,11 @@ import { TITLE_SPLITTERS_RE } from '../constants'
import { resolveSplitTitle } from './index'
import { stripTags } from '../../../utils'
export default function cleanTitle(title, $) {
export default function cleanTitle(title, url, $) {
// If title has |, :, or - in it, see if
// we can clean it up.
if (TITLE_SPLITTERS_RE.test(title)) {
title = resolveSplitTitle(title)
title = resolveSplitTitle(title, url)
}
// Final sanity check that we didn't get a crazy title.

@ -9,7 +9,7 @@ import {
// Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed.
export default function resolveSplitTitle(title, url='http://example.com') {
export default function resolveSplitTitle(title, url='') {
// Splits while preserving splitters, like:
// ['The New New York', ' - ', 'The Washington Post']
title = title
@ -94,7 +94,7 @@ function cleanDomainFromTitle(splitTitle, url) {
const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '')
const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain)
if (endSlugRatio > .4 && endSlug.length > 5) {
if (endSlugRatio > .4 && endSlug.length >= 5) {
return splitTitle.slice(0, -2).join('')
}
}

Loading…
Cancel
Save