From bc971567189bd58960fb1f9dd2c4e7c3c712580f Mon Sep 17 00:00:00 2001 From: Adam Pash Date: Tue, 6 Sep 2016 10:00:43 -0400 Subject: [PATCH] fix: better scoring for image extensions --- TODO.md | 3 +-- src/extractor/generic/lead-image-url/constants.js | 4 ++-- src/extractor/generic/lead-image-url/score-image.test.js | 3 +++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/TODO.md b/TODO.md index aebeb01f..314a51be 100644 --- a/TODO.md +++ b/TODO.md @@ -1,8 +1,6 @@ TODO: -- Check that lead-image-url extractor isn't looking for end-of-string file extension matches (i.e., it could be ...foo.jpg?otherstuff - Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc) - extractNextPageUrl -- Try Closure webpack compiler - Rename all cleaners from cleanThing to clean - Make sure weightNodes flag is being passed properly - Get better sense of when cheerio returns a raw node and when a cheerio object @@ -12,6 +10,7 @@ TODO: - Separate constants into activity-specific folders (dom, scoring) DONE: +x Check that lead-image-url extractor isn't looking for end-of-string file extension matches (i.e., it could be ...foo.jpg?otherstuff x extractLeadImageUrl x extractDek x extractDatePublished diff --git a/src/extractor/generic/lead-image-url/constants.js b/src/extractor/generic/lead-image-url/constants.js index 6148c870..59044f4e 100644 --- a/src/extractor/generic/lead-image-url/constants.js +++ b/src/extractor/generic/lead-image-url/constants.js @@ -49,5 +49,5 @@ export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [ ] export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i') -export const GIF_RE = /\gif$/i -export const JPG_RE = /\jpe?g$/i +export const GIF_RE = /\.gif(\?.*)?$/i +export const JPG_RE = /\.jpe?g(\?.*)?$/i diff --git a/src/extractor/generic/lead-image-url/score-image.test.js b/src/extractor/generic/lead-image-url/score-image.test.js index 
2315cb25..c17d71a6 100644 --- a/src/extractor/generic/lead-image-url/score-image.test.js +++ b/src/extractor/generic/lead-image-url/score-image.test.js @@ -42,6 +42,9 @@ describe('scoreImageUrlUrl(url)', () => { const url3 = 'http://example.com/foojpg/bar' assert.equal(scoreImageUrl(url3), 0) + + const url4 = 'http://example.com/foo.jpg?bar=baz' + assert.equal(scoreImageUrl(url4), 10) }) })