dx: remove unnecessary comments in source (#205)

* dx: remove commented code and obvious comments that can be looked up

* dx: remove commented out eslint options

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* dx: remove test block as all its code was commented out

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* dx: remove regex example comments

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out import

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* dx: remove commented out code

* chore: remove empty files

* chore: re-run prettier on code that may have missed it

* added back necessary comments
George Haddad authored 5 years ago, committed by GitHub
parent e2dbd08ae7
commit 56badb51f5

@@ -1,5 +1,3 @@
-// Use this file as a starting point for your project's .eslintrc.
-// Copy this file, and add rule overrides as needed.
 {
   "parser": "babel-eslint",
   "extends": ["airbnb", "prettier"],
@@ -7,7 +5,6 @@
     "babel"
   ],
   "globals": {
-    /* mocha */
    "describe",
    "it",
    "fit",
@@ -23,7 +20,6 @@
     "generator-star-spacing": 0,
     "babel/generator-star-spacing": 0,
     "func-names": 0,
-    // "no-useless-escape": 0,
     "no-confusing-arrow": 0,
     "camelcase": 0,
     "no-multiple-empty-lines": [

@@ -1,33 +1,19 @@
-// Karma configuration
-// Generated on Mon Nov 14 2016 10:21:57 GMT-0800 (PST)
-// if (process.env.CI) {
-//   require('phantomjs-prebuilt').path = './node_modules/.bin/phantomjs';
-// }
 module.exports = function (config) {
   config.set({
-    // base path that will be used to resolve all patterns (eg. files, exclude)
     basePath: '',
-    // frameworks to use
-    // available frameworks: https://npmjs.org/browse/keyword/karma-adapter
     frameworks: ['jasmine', 'browserify'],
-    // list of files / patterns to load in the browser
     files: [
-      // 'test-main.js',
       './node_modules/phantomjs-polyfill-find/find-polyfill.js',
       './node_modules/phantomjs-polyfill-string-includes/index.js',
       { pattern: 'src/**/*.test.js', included: true },
     ],
-    // list of files to exclude
     exclude: [
     ],
-    // preprocess matching files before serving them to the browser
-    // available preprocessors: https://npmjs.org/browse/keyword/karma-preprocessor
     preprocessors: {
       'src/**/*.js': ['browserify'],
     },
@@ -40,35 +26,13 @@ module.exports = function (config) {
       ],
     },
-    // test results reporter to use
-    // possible values: 'dots', 'progress'
-    // available reporters: https://npmjs.org/browse/keyword/karma-reporter
     reporters: ['progress'],
-    // web server port
     port: 9876,
-    // enable / disable colors in the output (reporters and logs)
     colors: true,
-    // level of logging
-    // possible values: config.LOG_DISABLE || config.LOG_ERROR || config.LOG_WARN || config.LOG_INFO || config.LOG_DEBUG
     logLevel: config.LOG_INFO,
-    // enable / disable watching file and executing tests whenever any file changes
     autoWatch: false,
-    // start these browsers
-    // available browser launchers: https://npmjs.org/browse/keyword/karma-launcher
-    // browsers: ['PhantomJS'],
     browsers: [(process.env.CI ? 'PhantomJS' : 'Chrome')],
-    // Continuous Integration mode
-    // if true, Karma captures browsers, runs the tests and exits
     singleRun: true,
-    // Concurrency level
-    // how many browser should be started simultaneous
     concurrency: Infinity,
   });
 };

@@ -1,6 +1,5 @@
 // CLEAN AUTHOR CONSTANTS
 export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
-// author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',
 // CLEAN DEK CONSTANTS
 export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i');

@@ -10,11 +10,7 @@ describe('cleanDatePublished(dateString)', () => {
   it('returns a date', () => {
     const datePublished = cleanDatePublished('published: 1/1/2020');
-    assert.equal(
-      datePublished,
-      moment('1/1/2020', 'MM/DD/YYYY').toISOString()
-      // '2020-01-01T05:00:00.000Z',
-    );
+    assert.equal(datePublished, moment('1/1/2020', 'MM/DD/YYYY').toISOString());
   });
   it('returns null if date is invalid', () => {

@@ -5,13 +5,6 @@ import HTML from './fixtures/html';
 import { cleanTitle } from './index';
 describe('cleanTitle(title, { url, $ })', () => {
-  it('uses a single h1 if the title is too short or too long', () => {
-    // const title = "Too Short"
-    // const $ = cheerio.load(HTML.docWithH1)
-    //
-    // assert.equal(cleanTitle(title, { url: '', $ }), $('h1').text())
-  });
   it('only uses h1 if there is only one on the page', () => {
     const title = 'Too Short';
     const $ = cheerio.load(HTML.docWith2H1s);

@@ -1,12 +0,0 @@
-// import assert from 'assert';
-// import fs from 'fs';
-// import cheerio from 'cheerio';
-//
-// import collectAllPages from './collect-all-pages';
-//
-describe('collectAllPages(opts)', () => {
-  it('fetches additional pages', () => {
-    // const html = fs.readFileSync('./fixtures/ars.html');
-    // const $ = cheerio.load(html);
-  });
-});

@@ -59,16 +59,6 @@ describe('ForwardComExtractor', () => {
     assert.equal(date_published, '2016-12-28T20:32:00.000Z');
   });
-  // it('returns the dek', async () => {
-  //   // To pass this test, fill out the dek selector
-  //   // in ./src/extractors/custom/forward.com/index.js.
-  //   const { dek } = await result;
-  //
-  //   // Update these values with the expected values from
-  //   // the article.
-  //   assert.equal(dek, '');
-  // });
   it('returns the lead_image_url', async () => {
     // To pass this test, fill out the lead_image_url selector
     // in ./src/extractors/custom/forward.com/index.js.

@@ -31,9 +31,6 @@ export const TwitterExtractor = {
   },
   date_published: {
-    selectors: [
-      ['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms'],
-      // '.tweet.permalink-tweet .metadata',
-    ],
+    selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']],
   },
 };

@@ -31,12 +31,7 @@ export const WwwHuffingtonpostComExtractor = {
     // Is there anything in the content you selected that needs transformed
     // before it's consumable content? E.g., unusual lazy loaded images
-    transforms: {
-      // 'div.top-media': ($node) => {
-      //   const $figure = $node.children('figure');
-      //   $node.replaceWith($figure);
-      // },
-    },
+    transforms: {},
     // Is there anything that is in the result that shouldn't be?
     // The clean selectors will remove anything that matches from

@@ -15,15 +15,6 @@ export const NYTimesExtractor = {
   transforms: {
     'img.g-lazy': $node => {
       let src = $node.attr('src');
-      // const widths = $node.attr('data-widths')
-      //   .slice(1)
-      //   .slice(0, -1)
-      //   .split(',');
-      // if (widths.length) {
-      //   width = widths.slice(-1);
-      // } else {
-      //   width = '900';
-      // }
       const width = 640;
       src = src.replace('{{size}}', width);

@@ -24,10 +24,7 @@ export const WwwProspectmagazineCoUkExtractor = {
   },
   content: {
-    selectors: [
-      // ['article.type-post div.post_content p'],
-      'article .post_content',
-    ],
+    selectors: ['article .post_content'],
     // Is there anything in the content you selected that needs transformed
     // before it's consumable content? E.g., unusual lazy loaded images

@@ -29,10 +29,6 @@ export const WwwReutersComExtractor = {
     // Is there anything that is in the result that shouldn't be?
     // The clean selectors will remove anything that matches from
     // the result
-    clean: [
-      '#article-byline .author',
-      // 'span.location',
-      // 'span.articleLocation',
-    ],
+    clean: ['#article-byline .author'],
   },
 };

@@ -61,22 +61,6 @@ describe('WwwTmzComExtractor', () => {
     assert.equal(date_published, '2016-11-28T11:00:00.000Z');
   });
-  // it('returns the dek', async () => {
-  //   // To pass this test, fill out the dek selector
-  //   // in ./src/extractors/custom/www.tmz.com/index.js.
-  //   const html =
-  //     fs.readFileSync('./fixtures/www.tmz.com/1480368537455.html');
-  //   const articleUrl =
-  //     'http://www.tmz.com/2016/11/28/prince-wife-estate-will/';
-  //
-  //   const { dek } =
-  //     await Mercury.parse(articleUrl, html, { fallback: false });
-  //
-  //   // Update these values with the expected values from
-  //   // the article.
-  //   assert.equal(dek, '');
-  // });
   it('returns the lead_image_url', async () => {
     // To pass this test, fill out the lead_image_url selector
     // in ./src/extractors/custom/www.tmz.com/index.js.

@@ -14,11 +14,6 @@ import { scoreContent, findTopCandidate } from './scoring';
 //
 // Returns a cheerio object $
 export default function extractBestNode($, opts) {
-  // clone the node so we can get back to our
-  // initial parsed state if needed
-  // TODO Do I need this? AP
-  // let $root = $.root().clone()
   if (opts.stripUnlikelyCandidates) {
     $ = stripUnlikelyCandidates($);
   }

@@ -1,9 +1,6 @@
 import assert from 'assert';
 import cheerio from 'cheerio';
 import fs from 'fs';
-// import HTML from './fixtures/html'
 import extractBestNode from './extract-best-node';
 describe('extractBestNode($, flags)', () => {

@@ -80,11 +80,6 @@ const GenericContentExtractor = {
     }
     return normalizeSpaces($.html(node));
-    // if return_type == "html":
-    //   return normalize_spaces(node_to_html(node))
-    // else:
-    //   return node
   },
 };

@@ -9,13 +9,6 @@ describe('GenericContentExtractor', () => {
   describe('extract($, html, opts)', () => {
     it('extracts html and returns the article', () => {
       const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
-      // Array.from(range(1, 100)).map((i) => {
-      //   console.log(i)
-      //   clean(GenericContentExtractor.extract(
-      //     { $: null, html, url: 'http://example.com' }
-      //   ))
-      // })
       const result = clean(
         GenericContentExtractor.extract({
           $: null,
@@ -26,7 +19,6 @@ describe('GenericContentExtractor', () => {
       );
       assert(typeof result, 'string');
-      // console.log(result)
     });
   });
 });

@@ -49,12 +49,7 @@ export const DATE_PUBLISHED_SELECTORS = [
 // reference be a date string that is parseable by dateutil.parser.parse
 const abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
 export const DATE_PUBLISHED_URL_RES = [
-  // /2012/01/27/ but not /2012/01/293
   new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'),
-  // 20120127 or 20120127T but not 2012012733 or 8201201733
-  // /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,
-  // 2012-01-27
   new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'),
-  // /2012/jan/27/
   new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i'),
 ];

@@ -1,51 +1,11 @@
-// import {
-//   DEK_META_TAGS,
-//   DEK_SELECTORS,
-//   DEK_URL_RES,
-// } from './constants';
-// import { cleanDek } from 'cleaners';
-// import {
-//   extractFromMeta,
-//   extractFromSelectors,
-// } from 'utils/dom';
 // Currently there is only one selector for
 // deks. We should simply return null here
 // until we have a more robust generic option.
-// Below is the original source for this, for reference.
 const GenericDekExtractor = {
-  // extract({ $, content, metaCache }) {
   extract() {
     return null;
   },
 };
 export default GenericDekExtractor;
-// def extract_dek(self):
-//     # First, check to see if we have a matching meta tag that we can make
-//     # use of.
-//     dek = self.extract_from_meta('dek', constants.DEK_META_TAGS)
-//     if not dek:
-//         # Second, look through our CSS/XPath selectors. This may return
-//         # an HTML fragment.
-//         dek = self.extract_from_selectors('dek',
-//                                           constants.DEK_SELECTORS,
-//                                           text_only=False)
-//
-//     if dek:
-//         # Make sure our dek isn't in the first few thousand characters
-//         # of the content, otherwise it's just the start of the article
-//         # and not a true dek.
-//         content = self.extract_content()
-//         content_chunk = normalize_spaces(strip_tags(content[:2000]))
-//         dek_chunk = normalize_spaces(dek[:100]) # Already has no tags.
-//
-//         # 80% or greater similarity means the dek was very similar to some
-//         # of the starting content, so we skip it.
-//         if fuzz.partial_ratio(content_chunk, dek_chunk) < 80:
-//             return dek
-//
-//     return None

@@ -111,158 +111,3 @@ const GenericLeadImageUrlExtractor = {
 };
 export default GenericLeadImageUrlExtractor;
-// def extract(self):
-//     """
-//     # First, try to find the "best" image via the content.
-//     # We'd rather not have to fetch each image and check dimensions,
-//     # so try to do some analysis and determine them instead.
-//     content = self.extractor.extract_content(return_type="node")
-//     imgs = content.xpath('.//img')
-//     img_scores = defaultdict(int)
-//     logger.debug('Scoring %d images from content', len(imgs))
-//     for (i, img) in enumerate(imgs):
-//         img_score = 0
-//
-//         if not 'src' in img.attrib:
-//             logger.debug('No src attribute found')
-//             continue
-//
-//         try:
-//             parsed_img = urlparse(img.attrib['src'])
-//             img_path = parsed_img.path.lower()
-//         except ValueError:
-//             logger.debug('ValueError getting img path.')
-//             continue
-//         logger.debug('Image path is %s', img_path)
-//
-//         if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
-//             logger.debug('Positive URL hints match. Adding 20.')
-//             img_score += 20
-//
-//         if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
-//             logger.debug('Negative URL hints match. Subtracting 20.')
-//             img_score -= 20
-//
-//         # Gifs are more often structure than photos
-//         if img_path.endswith('gif'):
-//             logger.debug('gif found. Subtracting 10.')
-//             img_score -= 10
-//
-//         # JPGs are more often photographs
-//         if img_path.endswith('jpg'):
-//             logger.debug('jpg found. Adding 10.')
-//             img_score += 10
-//
-//         # PNGs are neutral.
-//
-//         # Alt attribute usually means non-presentational image.
-//         if 'alt' in img.attrib and len(img.attrib['alt']) > 5:
-//             logger.debug('alt attribute found. Adding 5.')
-//             img_score += 5
-//
-//         # Look through our parent and grandparent for figure-like
-//         # container elements, give a bonus if we find them
-//         parents = [img.getparent()]
-//         if parents[0] is not None and parents[0].getparent() is not None:
-//             parents.append(parents[0].getparent())
-//         for p in parents:
-//             if p.tag == 'figure':
-//                 logger.debug('Parent with <figure> tag found. Adding 25.')
-//                 img_score += 25
-//
-//             p_sig = ' '.join([p.get('id', ''), p.get('class', '')])
-//             if constants.PHOTO_HINTS_RE.search(p_sig):
-//                 logger.debug('Photo hints regex match. Adding 15.')
-//                 img_score += 15
-//
-//         # Look at our immediate sibling and see if it looks like it's a
-//         # caption. Bonus if so.
-//         sibling = img.getnext()
-//         if sibling is not None:
-//             if sibling.tag == 'figcaption':
-//                 img_score += 25
-//
-//             sib_sig = ' '.join([sibling.get('id', ''),
-//                                 sibling.get('class', '')]).lower()
-//             if 'caption' in sib_sig:
-//                 img_score += 15
-//
-//         # Pull out width/height if they were set.
-//         img_width = None
-//         img_height = None
-//         if 'width' in img.attrib:
-//             try:
-//                 img_width = float(img.get('width'))
-//             except ValueError:
-//                 pass
-//         if 'height' in img.attrib:
-//             try:
-//                 img_height = float(img.get('height'))
-//             except ValueError:
-//                 pass
-//
-//         # Penalty for skinny images
-//         if img_width and img_width <= 50:
-//             logger.debug('Skinny image found. Subtracting 50.')
-//             img_score -= 50
-//
-//         # Penalty for short images
-//         if img_height and img_height <= 50:
-//             # Wide, short images are more common than narrow, tall ones
-//             logger.debug('Short image found. Subtracting 25.')
-//             img_score -= 25
-//
-//         if img_width and img_height and not 'sprite' in img_path:
-//             area = img_width * img_height
-//
-//             if area < 5000: # Smaller than 50x100
-//                 logger.debug('Image with small area found. Subtracting 100.')
-//                 img_score -= 100
-//             else:
-//                 img_score += round(area/1000.0)
-//
-//         # If the image is higher on the page than other images,
-//         # it gets a bonus. Penalty if lower.
-//         logger.debug('Adding page placement bonus of %d.', len(imgs)/2 - i)
-//         img_score += len(imgs)/2 - i
-//
-//         # Use the raw src here because we munged img_path for case
-//         # insensitivity
-//         logger.debug('Final score is %d.', img_score)
-//         img_scores[img.attrib['src']] += img_score
-//
-//     top_score = 0
-//     top_url = None
-//     for (url, score) in img_scores.items():
-//         if score > top_score:
-//             top_url = url
-//             top_score = score
-//
-//     if top_score > 0:
-//         logger.debug('Using top score image from content. Score was %d', top_score)
-//         return top_url
-//
-//
-//     # If nothing else worked, check to see if there are any really
-//     # probable nodes in the doc, like <link rel="image_src" />.
-//     logger.debug('Trying to find lead image in probable nodes')
-//     for selector in constants.LEAD_IMAGE_URL_SELECTORS:
-//         nodes = self.resource.extract_by_selector(selector)
-//         for node in nodes:
-//             clean_value = None
-//             if node.attrib.get('src'):
-//                 clean_value = self.clean(node.attrib['src'])
-//
-//             if not clean_value and node.attrib.get('href'):
-//                 clean_value = self.clean(node.attrib['href'])
-//
-//             if not clean_value and node.attrib.get('value'):
-//                 clean_value = self.clean(node.attrib['value'])
-//
-//             if clean_value:
-//                 logger.debug('Found lead image in probable nodes.')
-//                 logger.debug('Node was: %s', node)
-//                 return clean_value
-//
-//     return None

@@ -1,8 +1,8 @@
 import assert from 'assert';
 import nock from 'nock'; // eslint-disable-line import/no-extraneous-dependencies
-// import fs from 'fs';
 import path from 'path';
 import cheerio from 'cheerio';
+// import fs from 'fs';
 export function clean(string) {
   return string
@@ -26,7 +26,6 @@ export function record(name, options = {}) {
   let has_fixtures = !!process.env.NOCK_RECORD;
   return {
-    // starts recording, or ensure the fixtures exist
     before: () => {
       if (cheerio.browser) return;
       if (!has_fixtures) {
@@ -45,18 +44,19 @@
         });
       }
     },
-    // saves our recording if fixtures didn't already exist
     after: done => {
       if (!has_fixtures && !cheerio.browser) {
         has_fixtures = nock.recorder.play();
         // eslint-disable-next-line no-console
         console.log(
           `This is disabled for browser/node interop. To capture fixutres,
-          open ${'`src/test-helpers.js`'} and comment out lines 55 and 56 and
-          uncomment fs import at top of file.`
+          open ${'`src/test-helpers.js`'} and comment out lines 57 and 58 and
+          uncomment the fs import at top of file.`
         );
         // const text = `const nock = require('nock');\n${has_fixtures.join('\n')}`;
         // fs.writeFile(fp, text, done);
       } else {
         done();
       }

@@ -20,7 +20,6 @@ export default function brsToPs($) {
       $element.remove();
     } else if (collapsing) {
       collapsing = false;
-      // $(element).replaceWith('<p />')
       paragraphize(element, $, true);
     }
   });

@@ -24,12 +24,6 @@ function removeAllButWhitelist($article, $) {
   return $article;
 }
-// function removeAttrs(article, $) {
-//   REMOVE_ATTRS.forEach((attr) => {
-//     $(`[${attr}]`, article).removeAttr(attr);
-//   });
-// }
 // Remove attributes like style or align
 export default function cleanAttributes($article, $) {
   // Grabbing the parent because at this point

@@ -6,7 +6,6 @@ export default function convertNodeTo($node, $, tag = 'p') {
     return $;
   }
   const attrs = getAttrs(node) || {};
-  // console.log(attrs)
   const attribString = Reflect.ownKeys(attrs)
     .map(key => `${key}=${attrs[key]}`)
