feat: added domain and url extractor (using same extractor)

commit 43ab423d575cd15cc55041fb3fe2f21ffdd7adff
Author: Adam Pash <adam.pash@gmail.com>
Date:   Wed Sep 14 11:57:25 2016 -0400
pull/3/head
Adam Pash 8 years ago
parent 67296691c2
commit f3a5d0ecca

@ -1,14 +1,14 @@
TODO:
- Complete response:
- add canonicalUrl
- add excerpt
- add domain
- add word count
- add total pages
- add rendered pages
- Test if .is method is faster than regex methods
DONE:
x add canonicalUrl
x add domain
x Separate constants into activity-specific folders (dom, scoring)
x extractNextPageUrl
x Make sure weightNodes flag is being passed properly

@ -1,5 +1,6 @@
import 'babel-polyfill';
import { removeAnchor } from 'utils/text';
import RootExtractor from 'extractors/root-extractor';
import Resource from 'resource';
import Iris from '../iris';
@ -34,7 +35,7 @@ export default async function collectAllPages(
previousUrls,
};
const nextPageResult = Iris.runExtraction(Extractor, extractorOpts);
const nextPageResult = RootExtractor.extract(Extractor, extractorOpts);
previousUrls.push(nextPageUrl);
result = {

@ -7,6 +7,7 @@ import GenericDatePublishedExtractor from './date-published/extractor';
import GenericDekExtractor from './dek/extractor';
import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
import GenericNextPageUrlExtractor from './next-page-url/extractor';
import GenericUrlExtractor from './url/extractor';
const GenericExtractor = {
// This extractor is the default for all domains
@ -18,6 +19,7 @@ const GenericExtractor = {
leadImageUrl: GenericLeadImageUrlExtractor.extract,
dek: GenericDekExtractor.extract,
nextPageUrl: GenericNextPageUrlExtractor.extract,
urlAndDomain: GenericUrlExtractor.extract,
extract(options) {
const { html } = options;
@ -34,6 +36,7 @@ const GenericExtractor = {
const leadImageUrl = this.leadImageUrl(options);
const dek = this.dek(options);
const nextPageUrl = this.nextPageUrl(options);
const { url, domain } = this.urlAndDomain(options);
return {
title,
@ -43,6 +46,8 @@ const GenericExtractor = {
leadImageUrl,
content,
nextPageUrl,
url,
domain,
};
},
};

@ -0,0 +1,3 @@
// Names of the <meta> tags that may carry a page's canonical URL.
export const CANONICAL_META_SELECTORS = ['og:url'];

@ -0,0 +1,41 @@
import URL from 'url';
import { extractFromMeta } from 'utils/dom';
import {
CANONICAL_META_SELECTORS,
} from './constants';
/**
 * Extracts the hostname portion of a URL string.
 *
 * @param {string} url - URL to inspect.
 * @returns {?string} The hostname, or null when `url` carries none
 *   (for example a relative path such as `/blog/post`).
 */
function parseDomain(url) {
  const { hostname } = URL.parse(url);
  return hostname;
}

/**
 * Builds the `{ url, domain }` pair returned by the extractor.
 *
 * @param {string} url - URL to report.
 * @returns {{ url: string, domain: ?string }} The URL and its hostname.
 */
function result(url) {
  return {
    url,
    domain: parseDomain(url),
  };
}

/**
 * Turns a candidate URL found in the document into an absolute one.
 *
 * Candidates that already name a host are returned untouched so that
 * well-formed canonical URLs are reported exactly as the page wrote them.
 * Host-less candidates (`/blog/post`, `../post`, `//host/path`) are resolved
 * against the URL the page was fetched from; without a base URL there is
 * nothing to resolve against, so the candidate is returned as-is.
 *
 * @param {string} candidate - URL taken from the document.
 * @param {string} [baseUrl] - URL of the page being extracted.
 * @returns {string} Absolute URL when it can be determined.
 */
function absolutize(candidate, baseUrl) {
  if (!baseUrl || parseDomain(candidate)) {
    return candidate;
  }
  return URL.resolve(baseUrl, candidate);
}

const GenericUrlExtractor = {
  /**
   * Determines the URL (and its domain) to report for a page.
   *
   * Sources are tried in order: the `<link rel=canonical>` href, then the
   * meta tags named in CANONICAL_META_SELECTORS, and finally the `url` the
   * caller passed in.
   *
   * @param {Object} options
   * @param {Function} options.$ - cheerio-style selector function for the page.
   * @param {string} options.url - URL the page was requested from.
   * @param {string[]} [options.metaCache] - forwarded to extractFromMeta.
   * @returns {{ url: string, domain: ?string }}
   */
  extract({ $, url, metaCache }) {
    // attr() on an empty selection yields a falsy value, so a separate
    // length check is not needed before reading the href.
    const canonicalHref = $('link[rel=canonical]').attr('href');
    if (canonicalHref) {
      return result(absolutize(canonicalHref, url));
    }

    const metaUrl = extractFromMeta($, CANONICAL_META_SELECTORS, metaCache);
    if (metaUrl) {
      return result(absolutize(metaUrl, url));
    }

    return result(url);
  },
};
export default GenericUrlExtractor;

@ -0,0 +1,63 @@
import assert from 'assert';
import cheerio from 'cheerio';
import GenericUrlExtractor from './extractor';
// Specs for GenericUrlExtractor.extract. Each case builds a small HTML
// document, loads it with cheerio, and checks the returned { url, domain }.
describe('GenericUrlExtractor', () => {
describe('extract({ $, url })', () => {
// Markup contains both a canonical <link> and an og:url <meta>.
// NOTE(review): both carry the same `clean` value, so this case cannot tell
// which of the two sources was actually used. No metaCache is passed here.
it('returns canonical url and domain first', () => {
const fullUrl = 'https://example.com/blog/post?utm_campain=poajwefpaoiwjefaepoj';
const clean = 'https://example.com/blog/post';
const html = `
<html>
<head>
<link rel="canonical" href="${clean}" />
<meta name="og:url" value="${clean}" />
</head>
</html>
`;
const $ = cheerio.load(html);
const { url, domain } = GenericUrlExtractor.extract({ $, url: fullUrl });
assert.equal(url, clean);
assert.equal(domain, 'example.com');
});
// Markup contains only the og:url <meta>; metaCache names that tag.
it('returns og:url second', () => {
const fullUrl = 'https://example.com/blog/post?utm_campain=poajwefpaoiwjefaepoj';
const clean = 'https://example.com/blog/post';
const html = `
<html>
<head>
<meta name="og:url" value="${clean}" />
</head>
</html>
`;
const $ = cheerio.load(html);
const metaCache = ['og:url'];
const { url, domain } = GenericUrlExtractor.extract({ $, url: fullUrl, metaCache });
assert.equal(url, clean);
assert.equal(domain, 'example.com');
});
// Markup has neither source and metaCache is empty, so the URL passed in
// (query string included) is expected back along with its hostname.
it('returns passed url if others are not found', () => {
const fullUrl = 'https://example.com/blog/post?utm_campain=poajwefpaoiwjefaepoj';
const html = `
<html>
<head>
</head>
</html>
`;
const $ = cheerio.load(html);
const metaCache = [];
const { url, domain } = GenericUrlExtractor.extract({ $, url: fullUrl, metaCache });
assert.equal(url, fullUrl);
assert.equal(domain, 'example.com');
});
});
});

@ -128,6 +128,7 @@ const RootExtractor = {
});
const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content });
const dek = extractResult({ ...opts, type: 'dek', content });
const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' });
return {
title,
content,
@ -136,6 +137,8 @@ const RootExtractor = {
leadImageUrl,
dek,
nextPageUrl,
url,
domain,
};
},
};

@ -14,17 +14,19 @@ import NYMagExtractor from './custom/nymag.com';
describe('RootExtractor', () => {
it('extracts based on custom selectors', () => {
const url = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
const fullUrl = 'http://nymag.com/daily/intelligencer/2016/09/trump-discussed-usd25k-donation-with-florida-ag-not-fraud.html';
const html = fs.readFileSync('./src/extractors/custom/nymag.com/fixtures/test.html', 'utf8');
const $ = cheerio.load(html);
const {
url,
title,
} = RootExtractor.extract(
NYMagExtractor, { url, html, $, metaCache: [] }
NYMagExtractor, { url: fullUrl, html, $, metaCache: [] }
);
assert.equal(title, 'Trump Claims He Discussed $25K Donation With Florida Attorney General, But Not Trump University Investigation');
assert.equal(url, fullUrl);
});
});

@ -16,7 +16,7 @@ const Iris = {
// Used when extracting title/author/date_published/dek
const metaCache = $('meta').map((_, node) => $(node).attr('name')).toArray();
let result = this.runExtraction(Extractor, { url, html, $, metaCache });
let result = RootExtractor.extract(Extractor, { url, html, $, metaCache });
const { title, nextPageUrl } = result;
if (fetchAllPages && nextPageUrl) {
@ -28,6 +28,7 @@ const Iris = {
$,
metaCache,
result,
Extractor,
title,
url,
}
@ -37,10 +38,6 @@ const Iris = {
return result;
},
runExtraction(Extractor, opts) {
return RootExtractor.extract(Extractor, opts);
},
};
export default Iris;

@ -16,7 +16,6 @@ describe('Iris', () => {
const result = await Iris.parse('https://googleblog.blogspot.com/2016/08/onhub-turns-one-today.html');
assert.equal(typeof result, 'object');
// console.log(result)
});
it('does wikipedia', async function() {

Loading…
Cancel
Save