mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
feat: generic excerpt extraction
This commit is contained in:
parent
457075889d
commit
b3481a2c45
4
src/extractors/generic/excerpt/constants.js
Normal file
4
src/extractors/generic/excerpt/constants.js
Normal file
@ -0,0 +1,4 @@
|
||||
// Meta tag names consulted when looking for a ready-made page excerpt.
// NOTE(review): the order presumably sets lookup priority inside
// `extractFromMeta` — confirm against utils/dom.
export const EXCERPT_META_SELECTORS = [
  'og:description',
  'twitter:description',
];
|
28
src/extractors/generic/excerpt/extractor.js
Normal file
28
src/extractors/generic/excerpt/extractor.js
Normal file
@ -0,0 +1,28 @@
|
||||
import ellipsize from 'ellipsize'
|
||||
|
||||
import {
|
||||
extractFromMeta,
|
||||
stripTags,
|
||||
} from 'utils/dom';
|
||||
|
||||
import { EXCERPT_META_SELECTORS } from './constants';
|
||||
|
||||
/**
 * Normalize excerpt text: collapse every run of whitespace (including
 * newlines) into a single space, trim the ends, and hand the result to
 * `ellipsize` for truncation.
 *
 * @param {string} content - Raw excerpt text.
 * @param {*} $ - Not used by this function; kept so the positional
 *   signature stays compatible with existing callers.
 * @param {number} [maxLength=200] - Maximum length passed to `ellipsize`.
 * @returns {string} The cleaned, possibly truncated excerpt.
 */
export function clean(content, $, maxLength = 200) {
  const collapsed = content.replace(/\s+/g, ' ').trim();
  // Honor the caller-supplied limit; previously a hard-coded 200 was passed
  // here and the `maxLength` argument was silently ignored.
  return ellipsize(collapsed, maxLength, { ellipse: '…' });
}
|
||||
|
||||
const GenericExcerptExtractor = {
  /**
   * Produce a short excerpt for a page.
   *
   * A description found by `extractFromMeta` (using
   * `EXCERPT_META_SELECTORS`) wins; otherwise the excerpt is built from the
   * beginning of the already-extracted content.
   *
   * @param {Object} opts
   * @param {Function} opts.$ - Document query function; also called with an
   *   HTML string in the fallback path to obtain its text.
   * @param {?string} opts.content - Extracted article HTML. A missing or
   *   non-string value is treated as empty instead of throwing.
   * @param {*} opts.metaCache - Passed through to `extractFromMeta`.
   * @returns {string} Whatever `clean` returns for the chosen text.
   */
  extract({ $, content, metaCache }) {
    const excerpt = extractFromMeta($, EXCERPT_META_SELECTORS, metaCache);
    if (excerpt) {
      return clean(stripTags(excerpt, $));
    }

    // Fall back to excerpting from the extracted content.
    const maxLength = 200;
    // Guard against absent content so the fallback cannot crash on `.slice`.
    const html = typeof content === 'string' ? content : '';
    // Only a prefix is parsed; presumably the 5x factor leaves room for
    // markup that disappears once the text is extracted.
    const shortContent = html.slice(0, maxLength * 5);
    return clean($(shortContent).text(), $, maxLength);
  },
};
|
||||
|
||||
export default GenericExcerptExtractor
|
84
src/extractors/generic/excerpt/extractor.test.js
Normal file
84
src/extractors/generic/excerpt/extractor.test.js
Normal file
@ -0,0 +1,84 @@
|
||||
import assert from 'assert'
|
||||
import cheerio from 'cheerio'
|
||||
|
||||
import {
|
||||
default as GenericExcerptExtractor,
|
||||
clean,
|
||||
} from './extractor'
|
||||
|
||||
// Specs for GenericExcerptExtractor.extract: description meta tags take
// priority, and the extracted content is the fallback source.
describe('GenericExcerptExtractor', () => {
  describe('extract({ $, content, metaCache })', () => {
    // Both supported meta tags share one fixture; only the tag name differs.
    ['og:description', 'twitter:description'].forEach((metaName) => {
      it(`returns ${metaName}`, () => {
        const actualExcerpt = 'Wow this is going to be something good.';
        const html = `
          <html>
            <head>
              <meta name="${metaName}" value="${actualExcerpt}" />
            </head>
          </html>
        `;
        const $ = cheerio.load(html);
        const metaCache = [metaName];

        const excerpt = GenericExcerptExtractor.extract({ $, content: '', metaCache });

        assert.equal(excerpt, actualExcerpt);
      });
    });

    it('falls back to the content', () => {
      const html = `
        <html>
          <head>
          </head>
        </html>
      `;
      const $ = cheerio.load(html);
      const content = '<div><p>Wow <b>this</b> is going to be something good.</p></div>';
      // No meta names cached, so the extractor has to use `content`.
      const metaCache = [];

      const excerpt = GenericExcerptExtractor.extract({ $, content, metaCache });

      assert.equal(excerpt, 'Wow this is going to be something good.');
    });
  });
});
|
||||
|
||||
// Specs for the `clean` helper using its default length limit.
describe('clean(text)', () => {
  it('truncates text longer than 200 chars and trims whitespace', () => {
    const longText = `
      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
      incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
      nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
      Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
      fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
      culpa qui officia deserunt mollit anim id est laborum.
    `;
    const text = clean(longText);

    // The expectation is written across several lines for readability and
    // then collapsed to single spaces so it is comparable to clean's output.
    const shouldBe = `
      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
      incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
      exercitation ullamco laboris nisi ut…
    `.replace(/\s+/g, ' ').trim();

    assert.equal(text, shouldBe);
  });
});
|
@ -8,6 +8,7 @@ import GenericDekExtractor from './dek/extractor';
|
||||
import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
|
||||
import GenericNextPageUrlExtractor from './next-page-url/extractor';
|
||||
import GenericUrlExtractor from './url/extractor';
|
||||
import GenericExcerptExtractor from './excerpt/extractor';
|
||||
|
||||
const GenericExtractor = {
|
||||
// This extractor is the default for all domains
|
||||
@ -20,6 +21,7 @@ const GenericExtractor = {
|
||||
dek: GenericDekExtractor.extract,
|
||||
nextPageUrl: GenericNextPageUrlExtractor.extract,
|
||||
urlAndDomain: GenericUrlExtractor.extract,
|
||||
excerpt: GenericExcerptExtractor.extract,
|
||||
|
||||
extract(options) {
|
||||
const { html } = options;
|
||||
@ -33,9 +35,10 @@ const GenericExtractor = {
|
||||
const datePublished = this.datePublished(options);
|
||||
const author = this.author(options);
|
||||
const content = this.content({ ...options, title });
|
||||
const leadImageUrl = this.leadImageUrl(options);
|
||||
const dek = this.dek(options);
|
||||
const leadImageUrl = this.leadImageUrl({ ...options, content });
|
||||
const dek = this.dek({ ...options, content });
|
||||
const nextPageUrl = this.nextPageUrl(options);
|
||||
const excerpt = this.excerpt({ ...options, content });
|
||||
const { url, domain } = this.urlAndDomain(options);
|
||||
|
||||
return {
|
||||
@ -48,6 +51,7 @@ const GenericExtractor = {
|
||||
nextPageUrl,
|
||||
url,
|
||||
domain,
|
||||
excerpt,
|
||||
};
|
||||
},
|
||||
};
|
||||
|
@ -1,9 +1,7 @@
|
||||
import URL from 'url';
|
||||
import { extractFromMeta } from 'utils/dom';
|
||||
|
||||
import {
|
||||
CANONICAL_META_SELECTORS,
|
||||
} from './constants';
|
||||
import { CANONICAL_META_SELECTORS } from './constants';
|
||||
|
||||
function parseDomain(url) {
|
||||
const parsedUrl = URL.parse(url);
|
||||
|
@ -128,6 +128,7 @@ const RootExtractor = {
|
||||
});
|
||||
const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content });
|
||||
const dek = extractResult({ ...opts, type: 'dek', content });
|
||||
const excerpt = extractResult({ ...opts, type: 'excerpt', content });
|
||||
const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' });
|
||||
return {
|
||||
title,
|
||||
@ -139,6 +140,7 @@ const RootExtractor = {
|
||||
nextPageUrl,
|
||||
url,
|
||||
domain,
|
||||
excerpt,
|
||||
};
|
||||
},
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user