chore: plumbing

pull/1/head
Adam Pash 8 years ago
parent 746d07d4a2
commit 7e28871a02

@ -4,15 +4,24 @@ import GenericContentExtractor from './content/extractor'
import GenericTitleExtractor from './title/extractor' import GenericTitleExtractor from './title/extractor'
const GenericExtractor = { const GenericExtractor = {
parse: (html) => { parse: (url, html) => {
let $ = cheerio.load(html) if (html) {
let $ = cheerio.load(html)
} else {
// TODO
// Fetch link, following redirects
// to return html and initialize $
}
// Cached value of every meta name in our document. // Cached value of every meta name in our document.
// Used when extracting title/author/date_published/dek // Used when extracting title/author/date_published/dek
const metaCache = $('meta').map((index, node) => $(node).attr('name')) const metaCache = $('meta').map((index, node) => {
return $(node).attr('name')
}).toArray()
const title = GenericTitleExtractor.extract($, metaCache) const title = GenericTitleExtractor.extract($, url, metaCache)
return { return {
content: GenericContentExtractor.parse(html), content: GenericContentExtractor.parse($, html),
title: title, title: title,
} }
} }

@ -0,0 +1,26 @@
import assert from 'assert'
import cheerio from 'cheerio'
import fs from 'fs'
import { clean } from './content/utils/dom/test-helpers'
import GenericExtractor from './index'
// Specs for GenericExtractor.parse(url, html): each case reads a saved
// article fixture from disk, runs the extractor on it, and checks the
// title that comes back.
describe('GenericExtractor', () => {
  describe('parse(url, html)', () => {
    // NOTE(review): the fixture paths are relative, so fs resolves them
    // against the process working directory rather than this file's
    // location — confirm the directory the test runner is launched from.

    // Each case names its fixture so a failure points at the right article.
    it("parses the latimes fixture and returns the article title", () => {
      const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')

      const { title } = GenericExtractor.parse("http://latimes.com", html)

      assert.equal(title, 'California appears poised to be first to ban power-guzzling big-screen TVs')
    })

    it("parses the wired fixture and returns the article title", () => {
      const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')

      const { title } = GenericExtractor.parse("http://wired.com", html)

      assert.equal(title, 'Airplane Tires Dont Explode on Landing Because They Are Pumped!')
    })
  })
})

@ -11,26 +11,26 @@ import {
} from '../utils' } from '../utils'
const GenericTitleExtractor = { const GenericTitleExtractor = {
extract($, cachedMeta) { extract($, url, cachedMeta) {
// First, check to see if we have a matching meta tag that we can make // First, check to see if we have a matching meta tag that we can make
// use of that is strongly associated with the headline. // use of that is strongly associated with the headline.
let title let title
title = extractFromMeta($, STRONG_TITLE_META_TAGS, cachedMeta) title = extractFromMeta($, STRONG_TITLE_META_TAGS, cachedMeta)
if (title) return cleanTitle(title) if (title) return cleanTitle(title, url, $)
// Second, look through our content selectors for the most likely // Second, look through our content selectors for the most likely
// article title that is strongly associated with the headline. // article title that is strongly associated with the headline.
title = extractFromSelectors($, STRONG_TITLE_SELECTORS) title = extractFromSelectors($, STRONG_TITLE_SELECTORS)
if (title) return cleanTitle(title) if (title) return cleanTitle(title, url, $)
// Third, check for weaker meta tags that may match. // Third, check for weaker meta tags that may match.
title = extractFromMeta($, WEAK_TITLE_META_TAGS, cachedMeta) title = extractFromMeta($, WEAK_TITLE_META_TAGS, cachedMeta)
if (title) return cleanTitle(title) if (title) return cleanTitle(title, url, $)
// Last, look for weaker selector tags that may match. // Last, look for weaker selector tags that may match.
title = extractFromSelectors($, WEAK_TITLE_SELECTORS) title = extractFromSelectors($, WEAK_TITLE_SELECTORS)
if (title) return cleanTitle(title) if (title) return cleanTitle(title, url, $)
// If no matches, return an empty string // If no matches, return an empty string
return "" return ""

@ -2,11 +2,11 @@ import { TITLE_SPLITTERS_RE } from '../constants'
import { resolveSplitTitle } from './index' import { resolveSplitTitle } from './index'
import { stripTags } from '../../../utils' import { stripTags } from '../../../utils'
export default function cleanTitle(title, $) { export default function cleanTitle(title, url, $) {
// If title has |, :, or - in it, see if // If title has |, :, or - in it, see if
// we can clean it up. // we can clean it up.
if (TITLE_SPLITTERS_RE.test(title)) { if (TITLE_SPLITTERS_RE.test(title)) {
title = resolveSplitTitle(title) title = resolveSplitTitle(title, url)
} }
// Final sanity check that we didn't get a crazy title. // Final sanity check that we didn't get a crazy title.

@ -9,7 +9,7 @@ import {
// Given a title with separators in it (colons, dashes, etc), // Given a title with separators in it (colons, dashes, etc),
// resolve whether any of the segments should be removed. // resolve whether any of the segments should be removed.
export default function resolveSplitTitle(title, url='http://example.com') { export default function resolveSplitTitle(title, url='') {
// Splits while preserving splitters, like: // Splits while preserving splitters, like:
// ['The New New York', ' - ', 'The Washington Post'] // ['The New New York', ' - ', 'The Washington Post']
title = title title = title
@ -94,7 +94,7 @@ function cleanDomainFromTitle(splitTitle, url) {
const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '') const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '')
const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain) const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain)
if (endSlugRatio > .4 && endSlug.length > 5) { if (endSlugRatio > .4 && endSlug.length >= 5) {
return splitTitle.slice(0, -2).join('') return splitTitle.slice(0, -2).join('')
} }
} }

Loading…
Cancel
Save