You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
mercury-parser/src/cleaners/content.test.js

35 lines
905 B
JavaScript

import assert from 'assert';
import cheerio from 'cheerio';
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash 
<adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago
import extractBestNode from 'extractors/generic/content/extract-best-node';
import extractCleanNode from './content';
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash 
<adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago
const fs = require('fs');
// Suite for extractCleanNode. The single case below parses a saved Wired
// fixture with cheerio, runs extractBestNode and then extractCleanNode on it,
// and checks the length of the whitespace-normalised text that remains.
describe('extractCleanNode(article, { $, cleanConditionally, title } })', () => {
// End-to-end check against the fixture; it asserts on text length only, not
// on the cleaned markup itself.
it('cleans cruft out of a DOM node', () => {
feat: update all fixtures and custom parsers to match (#713) * feat: Refactor and update fixtures This patch changes how fixtures are stored. Previously, a fixture's folder identified its domain and its filename identified when it was fetched. This has been changed so that the filename indicates the domain and the modified time of the file indicates how recently it was fetched. A fixture's filename can optionally include a modifier to distinguish between two different page types on the same domain, for example. Also included here are changes to the update-fixture script, both to accomodate the new filename scheme as well as to actually update all fixtures. The functionality for running automatically and opening PRs has been removed but will likely be reintroduced. Finally, all fixtures have been updated. * Remove reference to deleted extractor * feat: first batch of test and parser updates due to new fixtures * feat: update more custom parsers and unit tests * feat: update more custom parsers and unit tests and remove unnecessary parser * feat: update more custom parsers and unit tests * feat: update more parsers and add correct bloomberg html files * fix: remove console statement * feat: all parsers updated and tests passing * fix: update date_published tests to account for test server time difference * fix: cleanup remaining fixtures in folders * feat: move fixtures for newest custom parsers * feat: remove script changes * fix: update dist files to account for reverting script changes * adding .DS_Store to .gitignore * adding .DS_Store to .gitignore -- 2 * adding .DS_Store to .gitignore -- 3 lol * cleaning up some tests * fix: ran build:generator command to update generate-custom-parser dist file * fix: update rollup configs to generate source maps and update source maps * fix: use underscore in place of unused error variable * fix: remove unused fixture Co-authored-by: Postlight Bot <adam.pash+postlight-bot@postlight.com> Co-authored-by: flbn <overasc@gmail.com>
2 years ago
// Read the saved Wired fixture from disk (path is relative to the working
// directory) and parse it straight into a cheerio document.
const fixturePath = './fixtures/www.wired.com--content-test.html';
const $ = cheerio.load(fs.readFileSync(fixturePath, 'utf-8'));
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash 
<adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago
// Extraction flags for this case, all switched on; passed to extractBestNode.
const opts = {
  stripUnlikelyCandidates: true,
  weightNodes: true,
  cleanConditionally: true,
};
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash 
<adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago
// Select the candidate content node from the parsed document.
const bestNode = extractBestNode($, opts);
// The suite title documents extractCleanNode's options as a flat object
// ({ $, cleanConditionally, title }). The shorthand `{ $, opts }` nested the
// flags under an `opts` key, so cleanConditionally was never forwarded;
// pass it explicitly instead.
// NOTE(review): assumes extractCleanNode reads `cleanConditionally` from its
// second argument, as the describe() title states — confirm against the
// implementation in ./content.
const cleanNode = extractCleanNode(bestNode, {
  $,
  cleanConditionally: opts.cleanConditionally,
});
feat: implemented extractBestNode functionality Squashed commit of the following: commit 9af554dd975ff1778ed70c71fa9bde667fc5f880 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 15:19:32 2016 -0400 feat: add cleanHeaders commit 0dfea98eedc4f97fcbd78866322595c705e20521 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 14:30:49 2016 -0400 fix: scoring parent nodes recursively commit b6e5897a694adeb81e25a905aba72c0f45a8cc94 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 12:47:24 2016 -0400 feat: extract clean node up and running commit fb652c5db13db6bce7271efd68ba4b20515e9549 Author: Adam Pash <adam.pash@gmail.com> Date: Tue Aug 30 09:57:21 2016 -0400 chore: added test for p tags with nested tags (e.g., img, iframe) commit 731d0a2e4d89121dfafad195e9d0911805c4f8e4 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 17:50:33 2016 -0400 feat: extact clean node integrates most functions commit 322bc6534d30feb7c1c08d3813132badc6286b40 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:46:04 2016 -0400 feat: removing empty nodes as defined in constants commit f1d38932ea12a865814d2326970031fcb8515baa Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:33:31 2016 -0400 feat: cleaning attributes from nodes commit 0aa73ada6854af0ecd504bfe3d926a9524787ab5 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 16:09:56 2016 -0400 feat: cleaning h1s from text commit 12d4a309246285c278ce7765e4fbaa8271bb5889 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:52:03 2016 -0400 feat: removing spacer images commit 4e74ff830cc67586560f6fc72e2cfa432a3a2647 Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:38:49 2016 -0400 feat: stripping unwanted html from doc commit c774166e90169fd0c1aa89898d3f7a975e82bf0a Author: Adam Pash <adam.pash@gmail.com> Date: Mon Aug 29 15:17:32 2016 -0400 feat: removing small images, height attribute from images commit 3a8642f42cda451669c832482c5e1611b1ff2ea9 Author: Adam Pash 
<adam.pash@gmail.com> Date: Mon Aug 29 12:57:45 2016 -0400 feat: rewrite top level commit a1c03e779234b0aea02206d92ec3dcc15758507e Author: Adam Pash <adam.pash@gmail.com> Date: Fri Aug 26 17:34:36 2016 -0400 in a weird place rn
8 years ago
// Flatten the cleaned node's text: newlines are removed outright (not turned
// into spaces), then any remaining whitespace runs collapse to one space and
// the ends are trimmed.
const text = $(cleanNode)
  .text()
  .replace(/\n/g, '')
  .replace(/\s+/g, ' ')
  .trim();
// Either of two lengths is accepted. assert.ok with an explicit message
// reports the actual length on failure, instead of the bare
// "false == true" that comparing a boolean expression to `true` produces.
const acceptedLengths = [2656, 2657];
assert.ok(
  acceptedLengths.includes(text.length),
  `expected cleaned text length to be one of ${acceptedLengths.join(
    ', '
  )}, got ${text.length}`
);
});
});