You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
mercury-parser/src/extractors/generic/index.js

68 lines
2.3 KiB
JavaScript

import cheerio from 'cheerio';
import stringDirection from 'string-direction';
import GenericContentExtractor from './content/extractor';
import GenericTitleExtractor from './title/extractor';
import GenericAuthorExtractor from './author/extractor';
import GenericDatePublishedExtractor from './date-published/extractor';
import GenericDekExtractor from './dek/extractor';
import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
import GenericNextPageUrlExtractor from './next-page-url/extractor';
import GenericUrlExtractor from './url/extractor';
import GenericExcerptExtractor from './excerpt/extractor';
import GenericWordCountExtractor from './word-count/extractor';
/**
 * Extractor registered under the wildcard domain ('*').
 *
 * Every field property points at the `extract` function of the matching
 * Generic*Extractor module; `extract` below invokes each of them in turn and
 * assembles their results into one plain object.
 */
const GenericExtractor = {
  // Wildcard entry: this extractor is the default for all domains.
  domain: '*',

  // Field-level extractors, handed over as-is from their modules.
  title: GenericTitleExtractor.extract,
  date_published: GenericDatePublishedExtractor.extract,
  author: GenericAuthorExtractor.extract,
  // The only extractor bound to its owning module — presumably because its
  // `extract` relies on `this`; confirm in ./content/extractor.
  content: GenericContentExtractor.extract.bind(GenericContentExtractor),
  lead_image_url: GenericLeadImageUrlExtractor.extract,
  dek: GenericDekExtractor.extract,
  next_page_url: GenericNextPageUrlExtractor.extract,
  url_and_domain: GenericUrlExtractor.extract,
  excerpt: GenericExcerptExtractor.extract,
  word_count: GenericWordCountExtractor.extract,

  // Text direction is derived from the title string alone.
  direction: ({ title }) => {
    return stringDirection.getDirection(title);
  },

  /**
   * Run every field extractor and collect the results.
   *
   * @param {Object} options - Options forwarded to each field extractor.
   * @param {string} [options.html] - Markup to parse with cheerio when no
   *   `options.$` was supplied.
   * @param {Function} [options.$] - Already-loaded cheerio document; when
   *   present, `options.html` is not parsed again.
   * @returns {Object} Object with the keys `title`, `author`,
   *   `date_published` (`null` when the extractor returns a falsy value),
   *   `dek`, `lead_image_url`, `content`, `next_page_url`, `url`, `domain`,
   *   `excerpt`, `word_count` and `direction`.
   */
  extract(options) {
    const { html, $: parsedDocument } = options;

    // NOTE: the freshly parsed document is written back onto the caller's
    // `options` object, so the caller observes `options.$` after this call.
    if (!parsedDocument && html) {
      options.$ = cheerio.load(html);
    }

    const title = this.title(options);
    const publishedAt = this.date_published(options);
    const author = this.author(options);
    // The content extractor additionally receives the extracted title.
    const content = this.content({ ...options, title });

    // Builds a fresh shallow copy on every call so that no two extractors
    // ever share the same options object.
    const withContent = () => ({ ...options, content });

    const leadImageUrl = this.lead_image_url(withContent());
    const dek = this.dek(withContent());
    const nextPageUrl = this.next_page_url(options);
    const excerpt = this.excerpt(withContent());
    const wordCount = this.word_count(withContent());
    const direction = this.direction({ title });
    const { url, domain } = this.url_and_domain(options);

    return {
      title,
      author,
      // Falsy results are normalised to null.
      date_published: publishedAt || null,
      dek,
      lead_image_url: leadImageUrl,
      content,
      next_page_url: nextPageUrl,
      url,
      domain,
      excerpt,
      word_count: wordCount,
      direction,
    };
  },
};
export default GenericExtractor;