feat: added domain and url extractor (using same extractor)
commit 43ab423d575cd15cc55041fb3fe2f21ffdd7adff Author: Adam Pash <adam.pash@gmail.com> Date: Wed Sep 14 11:57:25 2016 -0400 pull/3/head
parent
67296691c2
commit
f3a5d0ecca
@ -0,0 +1,3 @@
|
||||
/**
 * Meta names handed to `extractFromMeta` by the generic URL extractor when
 * it looks for a page's canonical URL.
 */
export const CANONICAL_META_SELECTORS = ['og:url'];
|
@ -0,0 +1,41 @@
|
||||
import URL from 'url';
|
||||
import { extractFromMeta } from 'utils/dom';
|
||||
|
||||
import {
|
||||
CANONICAL_META_SELECTORS,
|
||||
} from './constants';
|
||||
|
||||
/**
 * Extracts the hostname (no port) from a URL string.
 *
 * @param {string} url - URL to inspect.
 * @returns {?string} The hostname, or `null` when the input is not a string
 *   or does not contain a host (e.g. a relative path).
 */
function parseDomain(url) {
  // The legacy parser throws a TypeError on non-string input; report
  // "no domain" the same way it already does for host-less URLs.
  if (typeof url !== 'string') {
    return null;
  }

  return URL.parse(url).hostname;
}
|
||||
|
||||
/**
 * Builds the extractor's return value for a given URL.
 *
 * @param {string} url - URL to report.
 * @returns {{url: string, domain: ?string}} The URL together with the
 *   domain that `parseDomain` derives from it.
 */
function result(url) {
  const domain = parseDomain(url);
  return { url, domain };
}
|
||||
|
||||
/**
 * Makes a candidate URL absolute using the page URL as the base.
 * Candidates that already name a host, non-string candidates, and calls
 * without a usable base are returned untouched.
 */
function resolveAgainst(candidate, base) {
  if (typeof candidate !== 'string' || typeof base !== 'string') {
    return candidate;
  }
  if (URL.parse(candidate).hostname) {
    return candidate;
  }
  return URL.resolve(base, candidate);
}

/**
 * Determines a page's canonical URL and its domain.
 *
 * Lookup order:
 *   1. `<link rel=canonical>` href
 *   2. the first match from `extractFromMeta` for CANONICAL_META_SELECTORS
 *   3. the URL that was passed in
 */
const GenericUrlExtractor = {
  /**
   * @param {Object} opts
   * @param {Function} opts.$ - cheerio-style query function for the document.
   * @param {string} opts.url - URL the document was fetched from; used as the
   *   fallback and as the base for relative canonical values.
   * @param {*} [opts.metaCache] - forwarded as-is to `extractFromMeta`
   *   (the tests pass an array of meta names present on the page).
   * @returns {{url: string, domain: ?string}}
   */
  extract({ $, url, metaCache }) {
    // `.attr` yields undefined on an empty selection, so no length check is needed.
    const href = $('link[rel=canonical]').attr('href');
    if (href) {
      // Canonical links may be relative; resolve them so `domain` is usable.
      return result(resolveAgainst(href, url));
    }

    const metaUrl = extractFromMeta($, CANONICAL_META_SELECTORS, metaCache);
    if (metaUrl) {
      return result(resolveAgainst(metaUrl, url));
    }

    return result(url);
  },
};
|
||||
|
||||
export default GenericUrlExtractor;
|
@ -0,0 +1,63 @@
|
||||
import assert from 'assert';
|
||||
import cheerio from 'cheerio';
|
||||
|
||||
import GenericUrlExtractor from './extractor';
|
||||
|
||||
// Exercises the lookup order of GenericUrlExtractor.extract:
// canonical link, then og:url meta, then the URL that was passed in.
describe('GenericUrlExtractor', () => {
  describe('extract({ $, url })', () => {
    // URL as fetched, carrying a tracking query string.
    const fullUrl = 'https://example.com/blog/post?utm_campain=poajwefpaoiwjefaepoj';
    // Clean URL the page advertises for itself.
    const clean = 'https://example.com/blog/post';

    // Wraps the given <head> markup in a minimal document and loads it.
    const loadHead = head => cheerio.load(`
      <html>
        <head>
          ${head}
        </head>
      </html>
    `);

    it('returns canonical url and domain first', () => {
      // og:url deliberately differs from the canonical link so that a
      // precedence inversion makes this test fail.
      const ogUrl = 'https://example.com/blog/og-post';
      const $ = loadHead(`
        <link rel="canonical" href="${clean}" />
        <meta name="og:url" value="${ogUrl}" />
      `);
      const metaCache = ['og:url'];

      const { url, domain } = GenericUrlExtractor.extract({ $, url: fullUrl, metaCache });

      assert.strictEqual(url, clean);
      assert.strictEqual(domain, 'example.com');
    });

    it('returns og:url second', () => {
      const $ = loadHead(`
        <meta name="og:url" value="${clean}" />
      `);
      const metaCache = ['og:url'];

      const { url, domain } = GenericUrlExtractor.extract({ $, url: fullUrl, metaCache });

      assert.strictEqual(url, clean);
      assert.strictEqual(domain, 'example.com');
    });

    it('returns passed url if others are not found', () => {
      const $ = loadHead('');
      const metaCache = [];

      const { url, domain } = GenericUrlExtractor.extract({ $, url: fullUrl, metaCache });

      assert.strictEqual(url, fullUrl);
      assert.strictEqual(domain, 'example.com');
    });
  });
});
|
Loading…
Reference in New Issue