diff --git a/src/cleaners/constants.js b/src/cleaners/constants.js index 63d7a181..e8415a96 100644 --- a/src/cleaners/constants.js +++ b/src/cleaners/constants.js @@ -26,6 +26,8 @@ export const DEK_SELECTORS = [ ]; // CLEAN DATE PUBLISHED CONSTANTS +export const MS_DATE_STRING = /^\d{13}$/i; +export const SEC_DATE_STRING = /^\d{10}$/i; export const CLEAN_DATE_STRING_RE = /^\s*published\s*:?\s*(.*)/i; export const TIME_MERIDIAN_SPACE_RE = /(.*\d)(am|pm)(.*)/i; export const TIME_MERIDIAN_DOTS_RE = /\.m\./i; diff --git a/src/cleaners/date-published.js b/src/cleaners/date-published.js index c8e196e2..79da99c8 100644 --- a/src/cleaners/date-published.js +++ b/src/cleaners/date-published.js @@ -4,6 +4,8 @@ import moment from 'moment'; // but could just check for 'Invalid Date' string. import { + MS_DATE_STRING, + SEC_DATE_STRING, CLEAN_DATE_STRING_RE, SPLIT_DATE_STRING, TIME_MERIDIAN_SPACE_RE, @@ -22,6 +24,14 @@ export function cleanDateString(dateString) { // Take a date published string, and hopefully return a date out of // it. Return none if we fail. 
export default function cleanDatePublished(dateString) { + // Epoch timestamp strings: Date() takes milliseconds, so a 13-digit value is + // used as-is and a 10-digit (seconds) value is scaled by 1000. + if (MS_DATE_STRING.test(dateString)) { + dateString = parseInt(dateString, 10); + } else if (SEC_DATE_STRING.test(dateString)) { + dateString = parseInt(dateString, 10) * 1000; + } + let date = moment(new Date(dateString)); if (!date.isValid()) { diff --git a/src/extractors/custom/twitter.com/index.js b/src/extractors/custom/twitter.com/index.js index 88fc4ee5..963d71b7 100644 --- a/src/extractors/custom/twitter.com/index.js +++ b/src/extractors/custom/twitter.com/index.js @@ -40,7 +40,8 @@ const TwitterExtractor = { date_published: { selectors: [ - '.tweet.permalink-tweet .metadata', + '.permalink-tweet ._timestamp[data-time-ms]', + // '.tweet.permalink-tweet .metadata', ], }, diff --git a/src/extractors/custom/twitter.com/index.test.js b/src/extractors/custom/twitter.com/index.test.js index 7441fed4..b4c62543 100644 --- a/src/extractors/custom/twitter.com/index.test.js +++ b/src/extractors/custom/twitter.com/index.test.js @@ -12,7 +12,7 @@ describe('TwitterExtractor', () => { assert.equal(title, 'Lina Morgana on Twitter'); assert.equal(author, '@KingBeyonceStan'); - assert.equal(date_published, '2016-06-21T08:27:00.000Z'); + assert.equal(date_published, '2016-06-21T15:27:25.000Z'); }); }); diff --git a/src/extractors/generic/index.test.js b/src/extractors/generic/index.test.js index 76c50c37..fa9e9ef1 100644 --- a/src/extractors/generic/index.test.js +++ b/src/extractors/generic/index.test.js @@ -23,8 +23,8 @@ describe('GenericExtractor', () => { 'California appears poised to be first to ban power-guzzling big-screen TVs' ); assert.equal( - date_published, - '2009-10-14T04:00:00.000Z' + date_published.split('T')[0], + '2009-10-14' ); assert.equal(dek, null); });