You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
68 lines
2.3 KiB
JavaScript
68 lines
2.3 KiB
JavaScript
import cheerio from 'cheerio';
|
|
import stringDirection from 'string-direction';
|
|
|
|
import GenericContentExtractor from './content/extractor';
|
|
import GenericTitleExtractor from './title/extractor';
|
|
import GenericAuthorExtractor from './author/extractor';
|
|
import GenericDatePublishedExtractor from './date-published/extractor';
|
|
import GenericDekExtractor from './dek/extractor';
|
|
import GenericLeadImageUrlExtractor from './lead-image-url/extractor';
|
|
import GenericNextPageUrlExtractor from './next-page-url/extractor';
|
|
import GenericUrlExtractor from './url/extractor';
|
|
import GenericExcerptExtractor from './excerpt/extractor';
|
|
import GenericWordCountExtractor from './word-count/extractor';
|
|
|
|
const GenericExtractor = {
|
|
// This extractor is the default for all domains
|
|
domain: '*',
|
|
title: GenericTitleExtractor.extract,
|
|
date_published: GenericDatePublishedExtractor.extract,
|
|
author: GenericAuthorExtractor.extract,
|
|
content: GenericContentExtractor.extract.bind(GenericContentExtractor),
|
|
lead_image_url: GenericLeadImageUrlExtractor.extract,
|
|
dek: GenericDekExtractor.extract,
|
|
next_page_url: GenericNextPageUrlExtractor.extract,
|
|
url_and_domain: GenericUrlExtractor.extract,
|
|
excerpt: GenericExcerptExtractor.extract,
|
|
word_count: GenericWordCountExtractor.extract,
|
|
direction: ({ title }) => stringDirection.getDirection(title),
|
|
|
|
extract(options) {
|
|
const { html, $ } = options;
|
|
|
|
if (html && !$) {
|
|
const loaded = cheerio.load(html);
|
|
options.$ = loaded;
|
|
}
|
|
|
|
const title = this.title(options);
|
|
const date_published = this.date_published(options);
|
|
const author = this.author(options);
|
|
const content = this.content({ ...options, title });
|
|
const lead_image_url = this.lead_image_url({ ...options, content });
|
|
const dek = this.dek({ ...options, content });
|
|
const next_page_url = this.next_page_url(options);
|
|
const excerpt = this.excerpt({ ...options, content });
|
|
const word_count = this.word_count({ ...options, content });
|
|
const direction = this.direction({ title });
|
|
const { url, domain } = this.url_and_domain(options);
|
|
|
|
return {
|
|
title,
|
|
author,
|
|
date_published: date_published || null,
|
|
dek,
|
|
lead_image_url,
|
|
content,
|
|
next_page_url,
|
|
url,
|
|
domain,
|
|
excerpt,
|
|
word_count,
|
|
direction,
|
|
};
|
|
},
|
|
};
|
|
|
|
export default GenericExtractor;
|