mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
feat: generic excerpt extraction
This commit is contained in:
parent
457075889d
commit
b3481a2c45
4
src/extractors/generic/excerpt/constants.js
Normal file
4
src/extractors/generic/excerpt/constants.js
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
/**
 * Meta tag names that the generic excerpt extractor hands to
 * `extractFromMeta` when looking for a ready-made page description.
 */
export const EXCERPT_META_SELECTORS = ['og:description', 'twitter:description'];
|
28
src/extractors/generic/excerpt/extractor.js
Normal file
28
src/extractors/generic/excerpt/extractor.js
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
import ellipsize from 'ellipsize'
|
||||||
|
|
||||||
|
import {
|
||||||
|
extractFromMeta,
|
||||||
|
stripTags,
|
||||||
|
} from 'utils/dom';
|
||||||
|
|
||||||
|
import { EXCERPT_META_SELECTORS } from './constants';
|
||||||
|
|
||||||
|
/**
 * Normalizes excerpt text: collapses every run of whitespace into a single
 * space, trims both ends, and shortens the result with `ellipsize`, using
 * '…' as the ellipsis.
 *
 * @param {string} content - Text to normalize and shorten.
 * @param {*} $ - Not used by this function; kept so existing callers that
 *   pass it positionally keep working.
 * @param {number} [maxLength=200] - Length limit forwarded to `ellipsize`.
 * @returns {string} The cleaned, possibly truncated text.
 */
export function clean(content, $, maxLength = 200) {
  const normalized = content.replace(/[\s\n]+/g, ' ').trim();

  // Forward the caller's limit. The limit used to be a hard-coded 200, which
  // silently ignored the maxLength argument.
  return ellipsize(normalized, maxLength, { ellipse: '…' });
}
|
||||||
|
|
||||||
|
/**
 * Generic excerpt extractor. Prefers a description found in the page's meta
 * tags and otherwise derives the excerpt from the already-extracted content.
 */
const GenericExcerptExtractor = {
  /**
   * @param {Object} options
   * @param {Function} options.$ - DOM query function for the page (the tests
   *   pass the result of `cheerio.load`).
   * @param {?string} options.content - Extracted article HTML; only read when
   *   no meta description is found.
   * @param {Array<string>} options.metaCache - Forwarded to `extractFromMeta`.
   * @returns {string} The excerpt after `clean` has been applied.
   */
  extract({ $, content, metaCache }) {
    const metaExcerpt = extractFromMeta($, EXCERPT_META_SELECTORS, metaCache);
    if (metaExcerpt) {
      return clean(stripTags(metaExcerpt, $));
    }

    // Fall back to excerpting from the extracted content.
    const maxLength = 200;

    // Treat missing content as empty so the fallback yields an empty excerpt
    // instead of throwing a TypeError on `.slice`.
    const html = content || '';

    // Only the head of the document is parsed; five times the excerpt length
    // leaves room for the markup that `.text()` drops.
    const shortContent = html.slice(0, maxLength * 5);
    return clean($(shortContent).text(), $, maxLength);
  },
};
|
||||||
|
|
||||||
|
export default GenericExcerptExtractor
|
84
src/extractors/generic/excerpt/extractor.test.js
Normal file
84
src/extractors/generic/excerpt/extractor.test.js
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
import assert from 'assert'
|
||||||
|
import cheerio from 'cheerio'
|
||||||
|
|
||||||
|
import {
|
||||||
|
default as GenericExcerptExtractor,
|
||||||
|
clean,
|
||||||
|
} from './extractor'
|
||||||
|
|
||||||
|
// Exercises GenericExcerptExtractor.extract: meta descriptions are checked
// first, and the extracted content is the fallback.
describe('GenericExcerptExtractor', () => {
  describe('extract({ $, content, metaCache })', () => {
    // Loads a page whose <head> holds a single meta tag and runs the
    // extractor with empty content and that tag name in the meta cache.
    const excerptFromMeta = (metaName, metaValue) => {
      const $ = cheerio.load(`
        <html>
          <head>
            <meta name="${metaName}" value="${metaValue}" />
          </head>
        </html>
      `);

      return GenericExcerptExtractor.extract({ $, content: '', metaCache: [metaName] });
    };

    it('returns og:description', () => {
      const actualExcerpt = 'Wow this is going to be something good.';

      assert.equal(excerptFromMeta('og:description', actualExcerpt), actualExcerpt);
    });

    it('returns twitter:description', () => {
      const actualExcerpt = 'Wow this is going to be something good.';

      assert.equal(excerptFromMeta('twitter:description', actualExcerpt), actualExcerpt);
    });

    it('falls back to the content', () => {
      // No meta tags at all, so the excerpt has to come from `content`.
      const $ = cheerio.load(`
        <html>
          <head>
          </head>
        </html>
      `);
      const content = '<div><p>Wow <b>this</b> is going to be something good.</p></div>';

      const excerpt = GenericExcerptExtractor.extract({ $, content, metaCache: [] });

      assert.equal(excerpt, 'Wow this is going to be something good.');
    });
  });
});
|
||||||
|
|
||||||
|
// Checks clean() with its default limit: whitespace is collapsed and trimmed,
// and text past 200 characters is cut with a trailing '…'.
describe('clean(text)', () => {
  it('truncates text longer than 200 chars and trims whitespace', () => {
    const longText = `
      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
      incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis
      nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.
      Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu
      fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in
      culpa qui officia deserunt mollit anim id est laborum.
    `;

    // The expected value is written across several lines for readability and
    // flattened with the same whitespace normalization that clean() applies.
    const shouldBe = `
      Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor
      incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud
      exercitation ullamco laboris nisi ut…
    `
      .replace(/[\s\n]+/g, ' ')
      .trim();

    assert.equal(clean(longText), shouldBe);
  });
});
|
@ -8,6 +8,7 @@ import GenericDekExtractor from './dek/extractor';
|
|||||||
import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
|
import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
|
||||||
import GenericNextPageUrlExtractor from './next-page-url/extractor';
|
import GenericNextPageUrlExtractor from './next-page-url/extractor';
|
||||||
import GenericUrlExtractor from './url/extractor';
|
import GenericUrlExtractor from './url/extractor';
|
||||||
|
import GenericExcerptExtractor from './excerpt/extractor';
|
||||||
|
|
||||||
const GenericExtractor = {
|
const GenericExtractor = {
|
||||||
// This extractor is the default for all domains
|
// This extractor is the default for all domains
|
||||||
@ -20,6 +21,7 @@ const GenericExtractor = {
|
|||||||
dek: GenericDekExtractor.extract,
|
dek: GenericDekExtractor.extract,
|
||||||
nextPageUrl: GenericNextPageUrlExtractor.extract,
|
nextPageUrl: GenericNextPageUrlExtractor.extract,
|
||||||
urlAndDomain: GenericUrlExtractor.extract,
|
urlAndDomain: GenericUrlExtractor.extract,
|
||||||
|
excerpt: GenericExcerptExtractor.extract,
|
||||||
|
|
||||||
extract(options) {
|
extract(options) {
|
||||||
const { html } = options;
|
const { html } = options;
|
||||||
@ -33,9 +35,10 @@ const GenericExtractor = {
|
|||||||
const datePublished = this.datePublished(options);
|
const datePublished = this.datePublished(options);
|
||||||
const author = this.author(options);
|
const author = this.author(options);
|
||||||
const content = this.content({ ...options, title });
|
const content = this.content({ ...options, title });
|
||||||
const leadImageUrl = this.leadImageUrl(options);
|
const leadImageUrl = this.leadImageUrl({ ...options, content });
|
||||||
const dek = this.dek(options);
|
const dek = this.dek({ ...options, content });
|
||||||
const nextPageUrl = this.nextPageUrl(options);
|
const nextPageUrl = this.nextPageUrl(options);
|
||||||
|
const excerpt = this.excerpt({ ...options, content });
|
||||||
const { url, domain } = this.urlAndDomain(options);
|
const { url, domain } = this.urlAndDomain(options);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
@ -48,6 +51,7 @@ const GenericExtractor = {
|
|||||||
nextPageUrl,
|
nextPageUrl,
|
||||||
url,
|
url,
|
||||||
domain,
|
domain,
|
||||||
|
excerpt,
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
@ -1,9 +1,7 @@
|
|||||||
import URL from 'url';
|
import URL from 'url';
|
||||||
import { extractFromMeta } from 'utils/dom';
|
import { extractFromMeta } from 'utils/dom';
|
||||||
|
|
||||||
import {
|
import { CANONICAL_META_SELECTORS } from './constants';
|
||||||
CANONICAL_META_SELECTORS,
|
|
||||||
} from './constants';
|
|
||||||
|
|
||||||
function parseDomain(url) {
|
function parseDomain(url) {
|
||||||
const parsedUrl = URL.parse(url);
|
const parsedUrl = URL.parse(url);
|
||||||
|
@ -128,6 +128,7 @@ const RootExtractor = {
|
|||||||
});
|
});
|
||||||
const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content });
|
const leadImageUrl = extractResult({ ...opts, type: 'leadImageUrl', content });
|
||||||
const dek = extractResult({ ...opts, type: 'dek', content });
|
const dek = extractResult({ ...opts, type: 'dek', content });
|
||||||
|
const excerpt = extractResult({ ...opts, type: 'excerpt', content });
|
||||||
const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' });
|
const { url, domain } = extractResult({ ...opts, type: 'urlAndDomain' });
|
||||||
return {
|
return {
|
||||||
title,
|
title,
|
||||||
@ -139,6 +140,7 @@ const RootExtractor = {
|
|||||||
nextPageUrl,
|
nextPageUrl,
|
||||||
url,
|
url,
|
||||||
domain,
|
domain,
|
||||||
|
excerpt,
|
||||||
};
|
};
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user