You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/extractors/generic/date-published/extractor.js

37 lines
1.1 KiB
JavaScript

import { cleanDatePublished } from 'cleaners';
import {
extractFromMeta,
extractFromSelectors,
} from 'utils/dom';
import { extractFromUrl } from 'utils/text';
import {
DATE_PUBLISHED_META_TAGS,
DATE_PUBLISHED_SELECTORS,
DATE_PUBLISHED_URL_RES,
} from './constants';
/**
 * Generic extractor for an article's published date.
 *
 * Candidate sources are consulted in a fixed priority order; the first one
 * that yields a truthy value is handed to `cleanDatePublished` and that
 * result is returned as-is (later sources are not consulted, whatever the
 * cleaner returns).
 */
const GenericDatePublishedExtractor = {
  /**
   * @param {Object} opts
   * @param {*} opts.$ - DOM handle forwarded to the meta and selector lookups.
   * @param {*} opts.url - Page URL forwarded to the URL lookup.
   * @param {*} opts.metaCache - Forwarded unchanged to `extractFromMeta`.
   * @returns {*} The output of `cleanDatePublished` for the first truthy
   *   candidate, or `null` when no source produces one.
   */
  extract({ $, url, metaCache }) {
    // Each lookup is wrapped in a thunk so it only runs if every
    // higher-priority source came back empty.
    const lookups = [
      // 1. A matching meta tag. The trailing `false` asks the helper not
      //    to try cleaning tags from the string.
      () => extractFromMeta($, DATE_PUBLISHED_META_TAGS, metaCache, false),
      // 2. Known selectors that may hold a date_published value.
      () => extractFromSelectors($, DATE_PUBLISHED_SELECTORS),
      // 3. A date-like string embedded in the URL.
      () => extractFromUrl(url, DATE_PUBLISHED_URL_RES),
    ];

    for (const lookup of lookups) {
      const candidate = lookup();
      if (candidate) {
        return cleanDatePublished(candidate);
      }
    }

    return null;
  },
};
export default GenericDatePublishedExtractor;