diff --git a/src/extractor/generic/index.js b/src/extractor/generic/index.js
index f264d788..d31f7a77 100644
--- a/src/extractor/generic/index.js
+++ b/src/extractor/generic/index.js
@@ -4,15 +4,31 @@ import GenericContentExtractor from './content/extractor'
 import GenericTitleExtractor from './title/extractor'
 
 const GenericExtractor = {
-  parse: (html) => {
-    let $ = cheerio.load(html)
+  // Parse a document and return its generic fields: { content, title }.
+  // `url` is the address the document came from; `html` is its markup.
+  parse: (url, html) => {
+    // `$` is declared at function scope so the extraction code below can
+    // see it; a `let` inside the `if` block would be scoped to that block.
+    let $
+
+    if (html) {
+      $ = cheerio.load(html)
+    } else {
+      // TODO
+      // Fetch link, following redirects
+      // to return html and initialize $
+      throw new Error('GenericExtractor.parse: fetching html from a url is not implemented yet')
+    }
+
     // Cached value of every meta name in our document.
     // Used when extracting title/author/date_published/dek
-    const metaCache = $('meta').map((index, node) => $(node).attr('name'))
+    const metaCache = $('meta').map((index, node) => {
+      return $(node).attr('name')
+    }).toArray()
 
-    const title = GenericTitleExtractor.extract($, metaCache)
+    const title = GenericTitleExtractor.extract($, url, metaCache)
     return {
-      content: GenericContentExtractor.parse(html),
+      content: GenericContentExtractor.parse($, html),
       title: title,
     }
   }
diff --git a/src/extractor/generic/index.test.js b/src/extractor/generic/index.test.js
new file mode 100644
index 00000000..343a511d
--- /dev/null
+++ b/src/extractor/generic/index.test.js
@@ -0,0 +1,26 @@
+import assert from 'assert'
+import cheerio from 'cheerio'
+import fs from 'fs'
+
+import { clean } from './content/utils/dom/test-helpers'
+
+import GenericExtractor from './index'
+
+describe('GenericExtractor', () => {
+  describe('parse(url, html)', () => {
+    it("parses latimes html and returns the article title", () => {
+      const html = fs.readFileSync('../fixtures/latimes.html', 'utf-8')
+
+      const { title } = GenericExtractor.parse("http://latimes.com", html)
+      assert.equal(title, 'California appears poised to be first to ban power-guzzling big-screen TVs')
+    })
+
+    it("parses wired html and returns the article title", () => {
+      const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')
+
+      const { title } = GenericExtractor.parse("http://wired.com", html)
+      assert.equal(title, 'Airplane Tires Don’t Explode on Landing Because They Are Pumped!')
+    })
+  })
+})
+
diff --git a/src/extractor/generic/title/extractor.js b/src/extractor/generic/title/extractor.js
index eab3255b..b2aad2d4 100644
--- a/src/extractor/generic/title/extractor.js
+++ b/src/extractor/generic/title/extractor.js
@@ -11,26 +11,26 @@ import {
 } from '../utils'
 
 const GenericTitleExtractor = {
-  extract($, cachedMeta) {
+  extract($, url, cachedMeta) {
     // First, check to see if we have a matching meta tag that we can make
     // use of that is strongly associated with the headline.
     let title
 
     title = extractFromMeta($, STRONG_TITLE_META_TAGS, cachedMeta)
-    if (title) return cleanTitle(title)
+    if (title) return cleanTitle(title, url, $)
 
     // Second, look through our content selectors for the most likely
     // article title that is strongly associated with the headline.
     title = extractFromSelectors($, STRONG_TITLE_SELECTORS)
-    if (title) return cleanTitle(title)
+    if (title) return cleanTitle(title, url, $)
 
     // Third, check for weaker meta tags that may match.
     title = extractFromMeta($, WEAK_TITLE_META_TAGS, cachedMeta)
-    if (title) return cleanTitle(title)
+    if (title) return cleanTitle(title, url, $)
 
     // Last, look for weaker selector tags that may match.
     title = extractFromSelectors($, WEAK_TITLE_SELECTORS)
-    if (title) return cleanTitle(title)
+    if (title) return cleanTitle(title, url, $)
 
     // If no matches, return an empty string
     return ""
diff --git a/src/extractor/generic/title/utils/clean-title.js b/src/extractor/generic/title/utils/clean-title.js
index 110fca6e..da59603d 100644
--- a/src/extractor/generic/title/utils/clean-title.js
+++ b/src/extractor/generic/title/utils/clean-title.js
@@ -2,11 +2,11 @@ import { TITLE_SPLITTERS_RE } from '../constants'
 import { resolveSplitTitle } from './index'
 import { stripTags } from '../../../utils'
 
-export default function cleanTitle(title, $) {
+export default function cleanTitle(title, url, $) {
   // If title has |, :, or - in it, see if
   // we can clean it up.
   if (TITLE_SPLITTERS_RE.test(title)) {
-    title = resolveSplitTitle(title)
+    title = resolveSplitTitle(title, url)
   }
 
   // Final sanity check that we didn't get a crazy title.
diff --git a/src/extractor/generic/title/utils/resolve-split-title.js b/src/extractor/generic/title/utils/resolve-split-title.js
index ccd78233..d62552e1 100644
--- a/src/extractor/generic/title/utils/resolve-split-title.js
+++ b/src/extractor/generic/title/utils/resolve-split-title.js
@@ -9,7 +9,7 @@ import {
 
 // Given a title with separators in it (colons, dashes, etc),
 // resolve whether any of the segments should be removed.
-export default function resolveSplitTitle(title, url='http://example.com') {
+export default function resolveSplitTitle(title, url='') {
   // Splits while preserving splitters, like:
   // ['The New New York', ' - ', 'The Washington Post']
   title = title
@@ -94,7 +94,7 @@ function cleanDomainFromTitle(splitTitle, url) {
   const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(' ', '')
   const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain)
 
-  if (endSlugRatio > .4 && endSlug.length > 5) {
+  if (endSlugRatio > .4 && endSlug.length >= 5) {
     return splitTitle.slice(0, -2).join('')
   }
 }