fix: better scoring for image extensions

This commit is contained in:
Adam Pash 2016-09-06 10:00:43 -04:00
parent 11a2286659
commit bc97156718
3 changed files with 6 additions and 4 deletions

View File

@ -1,8 +1,6 @@
TODO: TODO:
- Check that lead-image-url extractor isn't looking for end-of-string file extension matches (i.e., it could be ...foo.jpg?otherstuff)
- Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc) - Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)
- extractNextPageUrl - extractNextPageUrl
- Try Closure webpack compiler
- Rename all cleaners from cleanThing to clean - Rename all cleaners from cleanThing to clean
- Make sure weightNodes flag is being passed properly - Make sure weightNodes flag is being passed properly
- Get better sense of when cheerio returns a raw node and when a cheerio object - Get better sense of when cheerio returns a raw node and when a cheerio object
@ -12,6 +10,7 @@ TODO:
- Separate constants into activity-specific folders (dom, scoring) - Separate constants into activity-specific folders (dom, scoring)
DONE: DONE:
x Check that lead-image-url extractor isn't looking for end-of-string file extension matches (i.e., it could be ...foo.jpg?otherstuff)
x extractLeadImageUrl x extractLeadImageUrl
x extractDek x extractDek
x extractDatePublished x extractDatePublished

View File

@ -49,5 +49,5 @@ export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
] ]
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i') export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
export const GIF_RE = /\gif$/i export const GIF_RE = /\.gif(\?.*)?$/i
export const JPG_RE = /\jpe?g$/i export const JPG_RE = /\.jpe?g(\?.*)?$/i

View File

@ -42,6 +42,9 @@ describe('scoreImageUrlUrl(url)', () => {
const url3 = 'http://example.com/foojpg/bar' const url3 = 'http://example.com/foojpg/bar'
assert.equal(scoreImageUrl(url3), 0) assert.equal(scoreImageUrl(url3), 0)
const url4 = 'http://example.com/foo.jpg?bar=baz'
assert.equal(scoreImageUrl(url4), 10)
}) })
}) })