diff --git a/TODO.md b/TODO.md index 3a94a6dc..23e62120 100644 --- a/TODO.md +++ b/TODO.md @@ -1,14 +1,14 @@ TODO: - Complete response: - - add canonicalUrl - add excerpt - - add domain - add word count - add total pages - add rendered pages - Test if .is method is faster than regex methods DONE: +x add canonicalUrl +x add domain x Separate constants into activity-specific folders (dom, scoring) x extractNextPageUrl x Make sure weightNodes flag is being passed properly diff --git a/src/extractors/collect-all-pages.js b/src/extractors/collect-all-pages.js index 4ba503c5..d297b3ee 100644 --- a/src/extractors/collect-all-pages.js +++ b/src/extractors/collect-all-pages.js @@ -1,5 +1,6 @@ import 'babel-polyfill'; import { removeAnchor } from 'utils/text'; +import RootExtractor from 'extractors/root-extractor'; import Resource from 'resource'; import Iris from '../iris'; @@ -34,7 +35,7 @@ export default async function collectAllPages( previousUrls, }; - const nextPageResult = Iris.runExtraction(Extractor, extractorOpts); + const nextPageResult = RootExtractor.extract(Extractor, extractorOpts); previousUrls.push(nextPageUrl); result = { diff --git a/src/extractors/generic/index.js b/src/extractors/generic/index.js index 90c39bf3..0bee6ecd 100644 --- a/src/extractors/generic/index.js +++ b/src/extractors/generic/index.js @@ -7,6 +7,7 @@ import GenericDatePublishedExtractor from './date-published/extractor'; import GenericDekExtractor from './dek/extractor'; import GenericLeadImageUrlExtractor from './lead-image-url/extractor'; import GenericNextPageUrlExtractor from './next-page-url/extractor'; +import GenericUrlExtractor from './url/extractor'; const GenericExtractor = { // This extractor is the default for all domains @@ -18,6 +19,7 @@ const GenericExtractor = { leadImageUrl: GenericLeadImageUrlExtractor.extract, dek: GenericDekExtractor.extract, nextPageUrl: GenericNextPageUrlExtractor.extract, + urlAndDomain: GenericUrlExtractor.extract, extract(options) { const { html } = options; @@ -34,6 +36,7 @@ const GenericExtractor = { const leadImageUrl = this.leadImageUrl(options); const dek = this.dek(options); const nextPageUrl = this.nextPageUrl(options); + const { url, domain } = this.urlAndDomain(options); return { title, @@ -43,6 +46,8 @@ const GenericExtractor = { leadImageUrl, content, nextPageUrl, + url, + domain, }; }, }; diff --git a/src/extractors/generic/url/constants.js b/src/extractors/generic/url/constants.js new file mode 100644 index 00000000..8015d0cd --- /dev/null +++ b/src/extractors/generic/url/constants.js @@ -0,0 +1,3 @@ +export const CANONICAL_META_SELECTORS = [ + 'og:url', +]; diff --git a/src/extractors/generic/url/extractor.js b/src/extractors/generic/url/extractor.js new file mode 100644 index 00000000..8347abd0 --- /dev/null +++ b/src/extractors/generic/url/extractor.js @@ -0,0 +1,41 @@ +import URL from 'url'; +import { extractFromMeta } from 'utils/dom'; + +import { + CANONICAL_META_SELECTORS, +} from './constants'; + +function parseDomain(url) { + const parsedUrl = URL.parse(url); + const { hostname } = parsedUrl; + return hostname; +} + +function result(url) { + return { + url, + domain: parseDomain(url), + }; +} + +const GenericUrlExtractor = { + extract({ $, url, metaCache }) { + const $canonical = $('link[rel=canonical]'); + if ($canonical.length !== 0) { + const href = $canonical.attr('href'); + if (href) { + return result(href); + } + } + + const metaUrl = extractFromMeta($, CANONICAL_META_SELECTORS, metaCache); + if (metaUrl) { + return result(metaUrl); + } + + return result(url); + }, + +}; + +export default GenericUrlExtractor; diff --git a/src/extractors/generic/url/extractor.test.js b/src/extractors/generic/url/extractor.test.js new file mode 100644 index 00000000..0b02fe8e --- /dev/null +++ b/src/extractors/generic/url/extractor.test.js @@ -0,0 +1,63 @@ +import assert from 'assert'; +import cheerio from 'cheerio'; + +import GenericUrlExtractor from './extractor'; + +describe('GenericUrlExtractor', () => { + describe('extract({ $, url })', () => { + it('returns canonical url and domain first', () => { + const fullUrl = 'https://example.com/blog/post?utm_campain=poajwefpaoiwjefaepoj'; + const clean = 'https://example.com/blog/post'; + const html = ` + + + + + + + `; + const $ = cheerio.load(html); + + const { url, domain } = GenericUrlExtractor.extract({ $, url: fullUrl }); + + assert.equal(url, clean); + assert.equal(domain, 'example.com'); + }); + + it('returns og:url second', () => { + const fullUrl = 'https://example.com/blog/post?utm_campain=poajwefpaoiwjefaepoj'; + const clean = 'https://example.com/blog/post'; + const html = ` + + + + + + `; + const $ = cheerio.load(html); + const metaCache = ['og:url']; + + const { url, domain } = GenericUrlExtractor.extract({ $, url: fullUrl, metaCache }); + + assert.equal(url, clean); + assert.equal(domain, 'example.com'); + }); + + it('returns passed url if others are not found', () => { + const fullUrl = 'https://example.com/blog/post?utm_campain=poajwefpaoiwjefaepoj'; + const html = ` + + + + + `; + const $ = cheerio.load(html); + const metaCache = []; + + const { url, domain } = GenericUrlExtractor.extract({ $, url: fullUrl, metaCache }); + + assert.equal(url, fullUrl); + assert.equal(domain, 'example.com'); + }); + }); +}); diff --git a/src/extractors/root-extractor.js b/src/extractors/root-extractor.js index 51c32f04..ca6a8805 100644 --- a/src/extractors/root-extractor.js +++ b/src/extractors/root-extractor.js @@ -128,6 +128,7 @@ const RootExtractor = { }); const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content }); const dek = extractResult({ ...opts, type: 'dek', content }); + const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' }); return { title, content, @@ -136,6 +137,8 @@ const RootExtractor = { leadImageUrl, dek, nextPageUrl, + url, + domain, }; }, }; diff --git a/src/extractors/root-extractor.test.js b/src/extractors/root-extractor.test.js index 347b1637..76b1199a 100644 --- a/src/extractors/root-extractor.test.js +++ b/src/extractors/root-extractor.test.js @@ -14,17 +14,19 @@ import NYMagExtractor from './custom/nymag.com'; describe('RootExtractor', () => { it('extracts based on custom selectors', () => { - const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'; + const fullUrl = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html'; const html = fs.readFileSync('./src/extractors/custom/nymag.com/fixtures/test.html', 'utf8'); const $ = cheerio.load(html); const { + url, title, } = RootExtractor.extract( - NYMagExtractor, { url, html, $, metaCache: [] } + NYMagExtractor, { url: fullUrl, html, $, metaCache: [] } ); assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation'); + assert.equal(url, fullUrl); }); }); diff --git a/src/iris.js b/src/iris.js index c6f54d62..d0be2a78 100644 --- a/src/iris.js +++ b/src/iris.js @@ -16,7 +16,7 @@ const Iris = { // Used when extracting title/author/date_published/dek const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray(); - let result = this.runExtraction(Extractor, { url, html, $, metaCache }); + let result = RootExtractor.extract(Extractor, { url, html, $, metaCache }); const { title, nextPageUrl } = result; if (fetchAllPages && nextPageUrl) { @@ -28,6 +28,7 @@ const Iris = { $, metaCache, result, + Extractor, title, url, } @@ -37,10 +38,6 @@ const Iris = { return result; }, - runExtraction(Extractor, opts) { - return RootExtractor.extract(Extractor, opts); - }, - }; export default Iris; diff --git a/src/iris.test.js b/src/iris.test.js index 39da5c35..db4fce6a 100644 --- a/src/iris.test.js +++ b/src/iris.test.js @@ -16,7 +16,6 @@ describe('Iris', () => { const result = await Iris.parse('https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html'); assert.equal(typeof result, 'object'); - // console.log(result) }); it('does wikipedia', async function() {