diff --git a/TODO.md b/TODO.md index 91e5bae8..4b068c95 100644 --- a/TODO.md +++ b/TODO.md @@ -1,5 +1,4 @@ TODO: -- remove logic for fetching meta attrs with custom props - extractNextPageUrl - Rename all cleaners from cleanThing to clean - Make sure weightNodes flag is being passed properly @@ -10,6 +9,7 @@ TODO: - Separate constants into activity-specific folders (dom, scoring) DONE: +x remove logic for fetching meta attrs with custom props x cleaning embed and object nodes x run makeLinksAbsolute on extracted content before returning x add option to fetch attrs in RootExtractor's select method diff --git a/src/extractors/generic/index.test.js b/src/extractors/generic/index.test.js index 84f802fd..4d463593 100644 --- a/src/extractors/generic/index.test.js +++ b/src/extractors/generic/index.test.js @@ -15,7 +15,6 @@ describe('GenericExtractor', () => { author, datePublished, dek, - leadImageUrl, } = GenericExtractor.extract( { url: "http://latimes.com", html, metaCache: [] } ) @@ -30,7 +29,6 @@ describe('GenericExtractor', () => { '2009-10-14T04:00:00.000Z' ) assert.equal(dek, null) - assert.equal(leadImageUrl, 'http://latimesblogs.latimes.com/fb.jpg') }) it("extracts html and returns the article title", () => { @@ -41,7 +39,6 @@ describe('GenericExtractor', () => { title, datePublished, dek, - leadImageUrl, } = GenericExtractor.extract( { url: "http://wired.com", html, metaCache: [] } ) @@ -53,7 +50,6 @@ describe('GenericExtractor', () => { ) assert.equal(datePublished, null) assert.equal(dek, null) - assert.equal(leadImageUrl, 'https://www.wired.com/wp-content/uploads/2016/08/GettyImages-536814811-1200x630-e1471497753973.jpg') }) }) diff --git a/src/extractors/generic/lead-image-url/constants.js b/src/extractors/generic/lead-image-url/constants.js index 59044f4e..07556786 100644 --- a/src/extractors/generic/lead-image-url/constants.js +++ b/src/extractors/generic/lead-image-url/constants.js @@ -2,8 +2,8 @@ // All attributes should be lowercase for faster case-insensitive matching. // From most distinct to least distinct. export const LEAD_IMAGE_URL_META_TAGS = [ - ['og:image', 'property', 'content'], - ['twitter:image', 'name', 'content'], + 'og:image', + 'twitter:image', 'image_src', ] diff --git a/src/extractors/generic/lead-image-url/fixtures/html.js b/src/extractors/generic/lead-image-url/fixtures/html.js index 3b8a95ca..2af7b519 100644 --- a/src/extractors/generic/lead-image-url/fixtures/html.js +++ b/src/extractors/generic/lead-image-url/fixtures/html.js @@ -3,7 +3,7 @@ const HTML = { test: ` - + `, @@ -13,7 +13,7 @@ const HTML = { test: ` - + `, diff --git a/src/utils/dom/extract-from-meta.js b/src/utils/dom/extract-from-meta.js index a6a8c2ad..41953e21 100644 --- a/src/utils/dom/extract-from-meta.js +++ b/src/utils/dom/extract-from-meta.js @@ -15,26 +15,14 @@ export default function extractFromMeta( cleanTags=true, ) { const foundNames = metaNames.filter(name => { - const metaType = typeof name - - if (metaType === 'string') { - return cachedNames.indexOf(name) !== -1 - } else if (metaType === 'object') { - return cachedNames.indexOf(name[0]) !== 1 - } + return cachedNames.indexOf(name) !== -1 }) for (let name of foundNames) { let type, value - if (typeof name === 'string') { - type = 'name' - value = 'value' - } else { - type = name[1] - value = name[2] - name = name[0] - } + type = 'name' + value = 'value' const nodes = $(`meta[${type}="${name}"]`) diff --git a/src/utils/dom/extract-from-meta.test.js b/src/utils/dom/extract-from-meta.test.js index 615a12fd..5c631a97 100644 --- a/src/utils/dom/extract-from-meta.test.js +++ b/src/utils/dom/extract-from-meta.test.js @@ -32,16 +32,6 @@ describe('extractFromMeta($, metaNames, cachedNames, cleanTags)', () => { assert.equal(result, HTML.metaEmptyDupes.result) }) - it('accepts custom attributes', () => { - const $ = cheerio.load(HTML.custom.test) - const metaNames = [['foo', 'property', 'content']] - const cachedNames = ['foo'] - const result = extractFromMeta( - $, metaNames, cachedNames - ) - - assert.equal(result, HTML.custom.result) - }) })