You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/extractors/generic/date-published/constants.js

62 lines
1.7 KiB
JavaScript

// Names of meta tags that are likely to carry an article's publication
// date, ranked from the most distinctive name down to the most generic.
// Every entry must stay lowercase so that case-insensitive matching
// against them remains cheap.
export const DATE_PUBLISHED_META_TAGS = [
  'article:published_time',
  'displaydate',
  'dc.date',
  'dc.date.issued',
  'rbpubdate',
  'publish_date',
  'pub_date',
  'pagedate',
  'pubdate',
  'revision_date',
  'doc_date',
  'date_created',
  'content_create_date',
  'lastmodified',
  'created',
  'date',
];
// CSS selectors (class/id based, not XPath) for elements that are likely
// to contain the publication date. Ordered from the most explicit
// selector to the least explicit one.
export const DATE_PUBLISHED_SELECTORS = [
  '.hentry .dtstamp.published',
  '.hentry .published',
  '.hentry .dtstamp.updated',
  '.hentry .updated',
  '.single .published',
  '.meta .published',
  '.meta .postDate',
  '.entry-date',
  '.byline .date',
  '.postmetadata .date',
  '.article_datetime',
  '.date-header',
  '.story-date',
  '.dateStamp',
  '#story .datetime',
  '.dateline',
  '.pubdate',
];
// Compiled, case-insensitive regular expressions that pull a likely
// publication date out of a URL, in the order they should be tried.
// In every pattern the FIRST capture group is the date string itself, so
// it must be something the downstream date parser can understand.
// NOTE(review): the original comment names Python's dateutil.parser.parse
// as that parser — confirm which parser actually consumes these matches.

// Alternation group of three-letter month abbreviations, e.g. "(jan|feb|...)".
const abbrevMonthsStr = `(${[
  'jan',
  'feb',
  'mar',
  'apr',
  'may',
  'jun',
  'jul',
  'aug',
  'sep',
  'oct',
  'nov',
  'dec',
].join('|')})`;

export const DATE_PUBLISHED_URL_RES = [
  // Slash-separated numeric date: /2012/01/27/ (the trailing slash is
  // required, so /2012/01/293 does not match).
  new RegExp(String.raw`/(20\d{2}/\d{2}/\d{2})/`, 'i'),

  // Disabled: compact form such as 20120127 or 20120127T, but not
  // 2012012733 or 8201201733.
  // /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,

  // Dash-separated numeric date: 2012-01-27
  new RegExp(String.raw`(20\d{2}-[01]\d-[0-3]\d)`, 'i'),

  // Slash-separated date with an abbreviated month name: /2012/jan/27/
  new RegExp(String.raw`/(20\d{2}/${abbrevMonthsStr}/[0-3]\d)/`, 'i'),
];