You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
101 lines
3.1 KiB
JavaScript
101 lines
3.1 KiB
JavaScript
import URL from 'url';
|
|
import 'babel-polyfill';
|
|
import wuzzy from 'wuzzy';
|
|
|
|
import {
|
|
TITLE_SPLITTERS_RE,
|
|
DOMAIN_ENDINGS_RE,
|
|
} from './constants';
|
|
|
|
/**
 * Pull the article title out of a heavily breadcrumbed page title, like:
 *   The Best Gadgets on Earth : Bits : Blogs : NYTimes.com
 *   NYTimes - Blogs - Bits - The Best Gadgets on Earth
 *
 * @param {string[]} splitTitle - Title segments. The repeated-term search
 *   below only finds a breadcrumb separator if the separators are present
 *   as their own entries in this array.
 * @param {string} text - The full, unsplit title.
 * @returns {?string} The longer of the first/last segment when it is more
 *   than 10 characters long; `text` when neither end is long enough; null
 *   when `splitTitle` has fewer than 6 entries.
 */
function extractBreadcrumbTitle(splitTitle, text) {
  if (splitTitle.length < 6) {
    return null;
  }

  // Look for a term that occurs more than once; it is probably the
  // breadcrumb separator. A prototype-less object keeps segments such as
  // "constructor" from colliding with inherited properties.
  const termCounts = Object.create(null);
  splitTitle.forEach((term) => {
    termCounts[term] = (termCounts[term] || 0) + 1;
  });

  let maxTerm = '';
  let maxCount = 0;
  Object.keys(termCounts).forEach((term) => {
    if (termCounts[term] > maxCount) {
      maxTerm = term;
      maxCount = termCounts[term];
    }
  });

  // Only re-split on the repeated term when it is short enough to be a
  // separator: at most 4 characters, so " >> " matches but nothing longer.
  const segments = (maxCount >= 2 && maxTerm.length <= 4)
    ? text.split(maxTerm)
    : splitTitle;

  // Compare the two ends as strings. On a tie the last segment wins.
  const firstEnd = segments[0];
  const lastEnd = segments[segments.length - 1];
  const longestEnd = firstEnd.length > lastEnd.length ? firstEnd : lastEnd;

  if (longestEnd.length > 10) {
    return longestEnd;
  }

  return text;
}
|
|
|
|
/**
 * Search the ends of the title for a segment that fuzzy-matches the URL's
 * host too closely. If one is found, discard it (and the entry next to it)
 * and return the rest joined back together.
 *
 * @param {string[]} splitTitle - Title segments; slicing off two entries at
 *   an end assumes the separator sits next to the segment being dropped.
 * @param {string} url - URL of the page the title came from.
 * @returns {?string} The title without the matching end, or null when the
 *   URL has no host or neither end matches.
 */
function cleanDomainFromTitle(splitTitle, url) {
  const { host } = URL.parse(url);

  // An empty or relative URL has no host, so there is nothing to compare
  // against. Bail out instead of throwing on `null.replace`.
  if (!host) {
    return null;
  }

  // Strip out the big TLDs - it just makes the matching a bit more
  // accurate. Not the end of the world if it doesn't strip right.
  const nakedDomain = host.replace(DOMAIN_ENDINGS_RE, '');

  // Remove every space, not just the first one, so multi-word site names
  // ("The Washington Post") collapse into a single slug.
  const startSlug = splitTitle[0].toLowerCase().replace(/ /g, '');
  const startSlugRatio = wuzzy.levenshtein(startSlug, nakedDomain);

  if (startSlugRatio > 0.4 && startSlug.length > 5) {
    return splitTitle.slice(2).join('');
  }

  const endSlug = splitTitle.slice(-1)[0].toLowerCase().replace(/ /g, '');
  const endSlugRatio = wuzzy.levenshtein(endSlug, nakedDomain);

  if (endSlugRatio > 0.4 && endSlug.length >= 5) {
    return splitTitle.slice(0, -2).join('');
  }

  return null;
}
|
|
|
|
/**
 * Given a title with separators in it (colons, dashes, etc), resolve
 * whether any of the segments should be removed.
 *
 * @param {string} title - The raw page title.
 * @param {string} [url=''] - URL of the page, used for domain matching.
 * @returns {string} The resolved title, or `title` itself when nothing
 *   should be stripped.
 */
export default function resolveSplitTitle(title, url = '') {
  // Splits while preserving splitters, like:
  // ['The New New York', ' - ', 'The Washington Post']
  const segments = title.split(TITLE_SPLITTERS_RE);

  // No separator present: nothing to resolve.
  if (segments.length === 1) {
    return title;
  }

  const breadcrumbTitle = extractBreadcrumbTitle(segments, title);
  if (breadcrumbTitle) {
    return breadcrumbTitle;
  }

  // If the domain match finds nothing either, the title is probably
  // legit, so hand the whole thing back.
  const domainlessTitle = cleanDomainFromTitle(segments, url);
  return domainlessTitle || title;
}
|