// An ordered list of meta tag names that denote // likely date published dates. All attributes // should be lowercase for faster case-insensitive matching. // From most distinct to least distinct. export const DATE_PUBLISHED_META_TAGS = [ 'article:published_time', 'displaydate', 'dc.date', 'dc.date.issued', 'rbpubdate', 'publish_date', 'pub_date', 'pagedate', 'pubdate', 'revision_date', 'doc_date', 'date_created', 'content_create_date', 'lastmodified', 'created', 'date', ]; // An ordered list of XPath Selectors to find // likely date published dates. From most explicit // to least explicit. export const DATE_PUBLISHED_SELECTORS = [ '.hentry .dtstamp.published', '.hentry .published', '.hentry .dtstamp.updated', '.hentry .updated', '.single .published', '.meta .published', '.meta .postDate', '.entry-date', '.byline .date', '.postmetadata .date', '.article_datetime', '.date-header', '.story-date', '.dateStamp', '#story .datetime', '.dateline', '.pubdate', ]; // An ordered list of compiled regular expressions to find likely date // published dates from the URL. These should always have the first // reference be a date string that is parseable by dateutil.parser.parse const abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'; export const DATE_PUBLISHED_URL_RES = [ // /2012/01/27/ but not /2012/01/293 new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'), // 20120127 or 20120127T but not 2012012733 or 8201201733 // /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i, // 2012-01-27 new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'), // /2012/jan/27/ new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i'), ];