mirror of
https://github.com/postlight/mercury-parser
synced 2024-11-17 03:25:31 +00:00
fix: better scoring for image extensions
This commit is contained in:
parent
11a2286659
commit
bc97156718
3
TODO.md
3
TODO.md
@ -1,8 +1,6 @@
|
||||
TODO:
|
||||
- Check that lead-image-url extractor isn't looking for end-of-string file extension matches (i.e., it could be ...foo.jpg?otherstuff)
|
||||
- Resource (fetches page, validates it, cleans it, normalizes meta tags (!), converts lazy-loaded images, makes links absolute, etc)
|
||||
- extractNextPageUrl
|
||||
- Try Closure webpack compiler
|
||||
- Rename all cleaners from cleanThing to clean
|
||||
- Make sure weightNodes flag is being passed properly
|
||||
- Get better sense of when cheerio returns a raw node and when a cheerio object
|
||||
@ -12,6 +10,7 @@ TODO:
|
||||
- Separate constants into activity-specific folders (dom, scoring)
|
||||
|
||||
DONE:
|
||||
x Check that lead-image-url extractor isn't looking for end-of-string file extension matches (i.e., it could be ...foo.jpg?otherstuff)
|
||||
x extractLeadImageUrl
|
||||
x extractDek
|
||||
x extractDatePublished
|
||||
|
@ -49,5 +49,5 @@ export const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
|
||||
]
|
||||
export const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
|
||||
|
||||
export const GIF_RE = /\gif$/i
|
||||
export const JPG_RE = /\jpe?g$/i
|
||||
export const GIF_RE = /\.gif(\?.*)?$/i
|
||||
export const JPG_RE = /\.jpe?g(\?.*)?$/i
|
||||
|
@ -42,6 +42,9 @@ describe('scoreImageUrlUrl(url)', () => {
|
||||
|
||||
const url3 = 'http://example.com/foojpg/bar'
|
||||
assert.equal(scoreImageUrl(url3), 0)
|
||||
|
||||
const url4 = 'http://example.com/foo.jpg?bar=baz'
|
||||
assert.equal(scoreImageUrl(url4), 10)
|
||||
})
|
||||
})
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user