dx: remove unnecessary comments in source (#205)

* dx: remove commented code and obvious comments that can be looked up * dx: remove commented out eslint options * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * dx: remove test block as all its code was commented out * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * dx: remove regex example comments * dx: remove commented out code * dx: remove commented out code * dx: remove commented out import * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * dx: remove commented out code * chore: remove empty files * chore: re-prettier code that may have missed it * added back nec comments
5 years ago · 56badb51f5
parent e2dbd08ae7
commit 56badb51f5
25 changed files with 10 additions and 353 deletions
--- a/.eslintrc
+++ b/.eslintrc
@ -1,5 +1,3 @@
-// Use this file as a starting point for your project's .eslintrc.
-// Copy this file, and add rule overrides as needed.
 {
  "parser": "babel-eslint",
  "extends": ["airbnb", "prettier"],
@ -7,7 +5,6 @@
    "babel"
  ],
  "globals": {
-    /* mocha */
    "describe",
    "it",
    "fit",
@ -23,7 +20,6 @@
    "generator-star-spacing": 0,
    "babel/generator-star-spacing": 0,
    "func-names": 0,
-    // "no-useless-escape": 0,
    "no-confusing-arrow": 0,
    "camelcase": 0,
    "no-multiple-empty-lines": [
--- a/karma.conf.js
+++ b/karma.conf.js
@ -1,33 +1,19 @@
-// Karma configuration
-// Generated on Mon Nov 14 2016 10:21:57 GMT-0800 (PST)
-// if (process.env.CI) {
-//   require('phantomjs-prebuilt').path = './node_modules/.bin/phantomjs';
-// }
-
 module.exports = function (config) {
  config.set({

-    // base path that will be used to resolve all patterns (eg. files, exclude)
    basePath: '',

-    // frameworks to use
-    // available frameworks: https://npmjs.org/browse/keyword/karma-adapter
    frameworks: ['jasmine', 'browserify'],

-    // list of files / patterns to load in the browser
    files: [
-      // 'test-main.js',
      './node_modules/phantomjs-polyfill-find/find-polyfill.js',
      './node_modules/phantomjs-polyfill-string-includes/index.js',
      { pattern: 'src/**/*.test.js', included: true },
    ],

-    // list of files to exclude
    exclude: [
    ],

-    // preprocess matching files before serving them to the browser
-    // available preprocessors: https://npmjs.org/browse/keyword/karma-preprocessor
    preprocessors: {
      'src/**/*.js': ['browserify'],
    },
@ -40,35 +26,13 @@ module.exports = function (config) {
      ],
    },

-    // test results reporter to use
-    // possible values: 'dots', 'progress'
-    // available reporters: https://npmjs.org/browse/keyword/karma-reporter
    reporters: ['progress'],
-
-    // web server port
    port: 9876,
-
-    // enable / disable colors in the output (reporters and logs)
    colors: true,
-
-    // level of logging
-    // possible values: config.LOG_DISABLE || config.LOG_ERROR || config.LOG_WARN || config.LOG_INFO || config.LOG_DEBUG
    logLevel: config.LOG_INFO,
-
-    // enable / disable watching file and executing tests whenever any file changes
    autoWatch: false,
-
-    // start these browsers
-    // available browser launchers: https://npmjs.org/browse/keyword/karma-launcher
-    // browsers: ['PhantomJS'],
    browsers: [(process.env.CI ? 'PhantomJS' : 'Chrome')],
-
-    // Continuous Integration mode
-    // if true, Karma captures browsers, runs the tests and exits
    singleRun: true,
-
-    // Concurrency level
-    // how many browser should be started simultaneous
    concurrency: Infinity,
  });
 };
--- a/src/cleaners/constants.js
+++ b/src/cleaners/constants.js
@ -1,6 +1,5 @@
 // CLEAN AUTHOR CONSTANTS
 export const CLEAN_AUTHOR_RE = /^\s*(posted |written )?by\s*:?\s*(.*)/i;
-//     author = re.sub(r'^\s*(posted |written )?by\s*:?\s*(.*)(?i)',

 // CLEAN DEK CONSTANTS
 export const TEXT_LINK_RE = new RegExp('http(s)?://', 'i');
--- a/src/cleaners/date-published.test.js
+++ b/src/cleaners/date-published.test.js
@ -10,11 +10,7 @@ describe('cleanDatePublished(dateString)', () => {
  it('returns a date', () => {
    const datePublished = cleanDatePublished('published: 1/1/2020');

-    assert.equal(
-      datePublished,
-      moment('1/1/2020', 'MM/DD/YYYY').toISOString()
-      // '2020-01-01T05:00:00.000Z',
-    );
+    assert.equal(datePublished, moment('1/1/2020', 'MM/DD/YYYY').toISOString());
  });

  it('returns null if date is invalid', () => {
--- a/src/cleaners/title.test.js
+++ b/src/cleaners/title.test.js
@ -5,13 +5,6 @@ import HTML from './fixtures/html';
 import { cleanTitle } from './index';

 describe('cleanTitle(title, { url, $ })', () => {
-  it('uses a single h1 if the title is too short or too long', () => {
-    // const title = "Too Short"
-    // const $ = cheerio.load(HTML.docWithH1)
-    //
-    // assert.equal(cleanTitle(title, { url: '', $ }), $('h1').text())
-  });
-
  it('only uses h1 if there is only one on the page', () => {
    const title = 'Too Short';
    const $ = cheerio.load(HTML.docWith2H1s);
--- a/src/extractors/collect-all-pages.test.js
+++ b/src/extractors/collect-all-pages.test.js
@ -1,12 +0,0 @@
-// import assert from 'assert';
-// import fs from 'fs';
-// import cheerio from 'cheerio';
-//
-// import collectAllPages from './collect-all-pages';
-//
-describe('collectAllPages(opts)', () => {
-  it('fetches additional pages', () => {
-    //     const html = fs.readFileSync('./fixtures/ars.html');
-    //     const $ = cheerio.load(html);
-  });
-});
--- a/src/extractors/custom/forward.com/index.test.js
+++ b/src/extractors/custom/forward.com/index.test.js
@ -59,16 +59,6 @@ describe('ForwardComExtractor', () => {
      assert.equal(date_published, '2016-12-28T20:32:00.000Z');
    });

-    // it('returns the dek', async () => {
-    // // To pass this test, fill out the dek selector
-    // // in ./src/extractors/custom/forward.com/index.js.
-    //   const { dek } = await result;
-    //
-    // // Update these values with the expected values from
-    // // the article.
-    //   assert.equal(dek, '');
-    // });
-
    it('returns the lead_image_url', async () => {
      // To pass this test, fill out the lead_image_url selector
      // in ./src/extractors/custom/forward.com/index.js.
--- a/src/extractors/custom/twitter.com/index.js
+++ b/src/extractors/custom/twitter.com/index.js
@ -31,9 +31,6 @@ export const TwitterExtractor = {
  },

  date_published: {
-    selectors: [
-      ['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms'],
-      // '.tweet.permalink-tweet .metadata',
-    ],
+    selectors: [['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms']],
  },
 };
--- a/src/extractors/custom/www.huffingtonpost.com/index.js
+++ b/src/extractors/custom/www.huffingtonpost.com/index.js
@ -31,12 +31,7 @@ export const WwwHuffingtonpostComExtractor = {

    // Is there anything in the content you selected that needs to be transformed
    // before it's consumable content? E.g., unusual lazy-loaded images
-    transforms: {
-      // 'div.top-media': ($node) => {
-      //   const $figure = $node.children('figure');
-      //   $node.replaceWith($figure);
-      // },
-    },
+    transforms: {},

    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
--- a/src/extractors/custom/www.nytimes.com/index.js
+++ b/src/extractors/custom/www.nytimes.com/index.js
@ -15,15 +15,6 @@ export const NYTimesExtractor = {
    transforms: {
      'img.g-lazy': $node => {
        let src = $node.attr('src');
-        // const widths = $node.attr('data-widths')
-        //                   .slice(1)
-        //                   .slice(0, -1)
-        //                   .split(',');
-        // if (widths.length) {
-        //   width = widths.slice(-1);
-        // } else {
-        //   width = '900';
-        // }
        const width = 640;

        src = src.replace('{{size}}', width);
--- a/src/extractors/custom/www.prospectmagazine.co.uk/index.js
+++ b/src/extractors/custom/www.prospectmagazine.co.uk/index.js
@ -24,10 +24,7 @@ export const WwwProspectmagazineCoUkExtractor = {
  },

  content: {
-    selectors: [
-      // ['article.type-post div.post_content p'],
-      'article .post_content',
-    ],
+    selectors: ['article .post_content'],

    // Is there anything in the content you selected that needs to be transformed
    // before it's consumable content? E.g., unusual lazy-loaded images
--- a/src/extractors/custom/www.reuters.com/index.js
+++ b/src/extractors/custom/www.reuters.com/index.js
@ -29,10 +29,6 @@ export const WwwReutersComExtractor = {
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
-    clean: [
-      '#article-byline .author',
-      // 'span.location',
-      // 'span.articleLocation',
-    ],
+    clean: ['#article-byline .author'],
  },
 };
--- a/src/extractors/custom/www.tmz.com/index.test.js
+++ b/src/extractors/custom/www.tmz.com/index.test.js
@ -61,22 +61,6 @@ describe('WwwTmzComExtractor', () => {
      assert.equal(date_published, '2016-11-28T11:00:00.000Z');
    });

-    // it('returns the dek', async () => {
-    //   // To pass this test, fill out the dek selector
-    //   // in ./src/extractors/custom/www.tmz.com/index.js.
-    //   const html =
-    //     fs.readFileSync('./fixtures/www.tmz.com/1480368537455.html');
-    //   const articleUrl =
-    //     'http://www.tmz.com/2016/11/28/prince-wife-estate-will/';
-    //
-    //   const { dek } =
-    //     await Mercury.parse(articleUrl, html, { fallback: false });
-    //
-    //   // Update these values with the expected values from
-    //   // the article.
-    //   assert.equal(dek, '');
-    // });
-
    it('returns the lead_image_url', async () => {
      // To pass this test, fill out the lead_image_url selector
      // in ./src/extractors/custom/www.tmz.com/index.js.
--- a/src/extractors/generic/content/extract-best-node.js
+++ b/src/extractors/generic/content/extract-best-node.js
@ -14,11 +14,6 @@ import { scoreContent, findTopCandidate } from './scoring';
 //
 // Returns a cheerio object $
 export default function extractBestNode($, opts) {
-  // clone the node so we can get back to our
-  // initial parsed state if needed
-  // TODO Do I need this? – AP
-  // let $root = $.root().clone()
-
  if (opts.stripUnlikelyCandidates) {
    $ = stripUnlikelyCandidates($);
  }
--- a/src/extractors/generic/content/extract-best-node.test.js
+++ b/src/extractors/generic/content/extract-best-node.test.js
@ -1,9 +1,6 @@
 import assert from 'assert';
 import cheerio from 'cheerio';
 import fs from 'fs';
-
-// import HTML from './fixtures/html'
-
 import extractBestNode from './extract-best-node';

 describe('extractBestNode($, flags)', () => {
--- a/src/extractors/generic/content/extractor.js
+++ b/src/extractors/generic/content/extractor.js
@ -80,11 +80,6 @@ const GenericContentExtractor = {
    }

    return normalizeSpaces($.html(node));
-
-    // if return_type == "html":
-    //     return normalize_spaces(node_to_html(node))
-    // else:
-    //     return node
  },
 };

--- a/src/extractors/generic/content/extractor.test.js
+++ b/src/extractors/generic/content/extractor.test.js
@ -9,13 +9,6 @@ describe('GenericContentExtractor', () => {
  describe('extract($, html, opts)', () => {
    it('extracts html and returns the article', () => {
      const html = fs.readFileSync('./fixtures/vulture.html', 'utf-8');
-
-      // Array.from(range(1, 100)).map((i) => {
-      //   console.log(i)
-      //   clean(GenericContentExtractor.extract(
-      //     { $: null, html, url: 'http://example.com' }
-      //   ))
-      // })
      const result = clean(
        GenericContentExtractor.extract({
          $: null,
@ -26,7 +19,6 @@ describe('GenericContentExtractor', () => {
      );

      assert(typeof result, 'string');
-      // console.log(result)
    });
  });
 });
--- a/src/extractors/generic/date-published/constants.js
+++ b/src/extractors/generic/date-published/constants.js
@ -49,12 +49,7 @@ export const DATE_PUBLISHED_SELECTORS = [
 // reference be a date string that is parseable by dateutil.parser.parse
 const abbrevMonthsStr = '(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)';
 export const DATE_PUBLISHED_URL_RES = [
-  // /2012/01/27/ but not /2012/01/293
  new RegExp('/(20\\d{2}/\\d{2}/\\d{2})/', 'i'),
-  // 20120127 or 20120127T but not 2012012733 or 8201201733
-  // /[^0-9](20\d{2}[01]\d[0-3]\d)([^0-9]|$)/i,
-  // 2012-01-27
  new RegExp('(20\\d{2}-[01]\\d-[0-3]\\d)', 'i'),
-  // /2012/jan/27/
  new RegExp(`/(20\\d{2}/${abbrevMonthsStr}/[0-3]\\d)/`, 'i'),
 ];
--- a/src/extractors/generic/dek/constants.js
+++ b/src/extractors/generic/dek/constants.js
--- a/src/extractors/generic/dek/extractor.js
+++ b/src/extractors/generic/dek/extractor.js
@ -1,51 +1,11 @@
-// import {
-//   DEK_META_TAGS,
-//   DEK_SELECTORS,
-//   DEK_URL_RES,
-// } from './constants';
-
-// import { cleanDek } from 'cleaners';
-
-// import {
-//   extractFromMeta,
-//   extractFromSelectors,
-// } from 'utils/dom';
-
 // Currently there is only one selector for
 // deks. We should simply return null here
 // until we have a more robust generic option.
 // The original reference source was removed from this file; see version-control history.
 const GenericDekExtractor = {
-  // extract({ $, content, metaCache }) {
  extract() {
    return null;
  },
 };

 export default GenericDekExtractor;
-
-// def extract_dek(self):
-//     # First, check to see if we have a matching meta tag that we can make
-//     # use of.
-//     dek = self.extract_from_meta('dek', constants.DEK_META_TAGS)
-//     if not dek:
-//         # Second, look through our CSS/XPath selectors. This may return
-//         # an HTML fragment.
-//         dek = self.extract_from_selectors('dek',
-//                                            constants.DEK_SELECTORS,
-//                                            text_only=False)
-//
-//     if dek:
-//         # Make sure our dek isn't in the first few thousand characters
-//         # of the content, otherwise it's just the start of the article
-//         # and not a true dek.
-//         content = self.extract_content()
-//         content_chunk = normalize_spaces(strip_tags(content[:2000]))
-//         dek_chunk = normalize_spaces(dek[:100]) # Already has no tags.
-//
-//         # 80% or greater similarity means the dek was very similar to some
-//         # of the starting content, so we skip it.
-//         if fuzz.partial_ratio(content_chunk, dek_chunk) < 80:
-//             return dek
-//
-//     return None
--- a/src/extractors/generic/lead-image-url/extractor.js
+++ b/src/extractors/generic/lead-image-url/extractor.js
@ -111,158 +111,3 @@ const GenericLeadImageUrlExtractor = {
 };

 export default GenericLeadImageUrlExtractor;
-
-// def extract(self):
-//     """
-//     # First, try to find the "best" image via the content.
-//     # We'd rather not have to fetch each image and check dimensions,
-//     # so try to do some analysis and determine them instead.
-//     content = self.extractor.extract_content(return_type="node")
-//     imgs = content.xpath('.//img')
-//     img_scores = defaultdict(int)
-//     logger.debug('Scoring %d images from content', len(imgs))
-//     for (i, img) in enumerate(imgs):
-//         img_score = 0
-//
-//         if not 'src' in img.attrib:
-//             logger.debug('No src attribute found')
-//             continue
-//
-//         try:
-//             parsed_img = urlparse(img.attrib['src'])
-//             img_path = parsed_img.path.lower()
-//         except ValueError:
-//             logger.debug('ValueError getting img path.')
-//             continue
-//         logger.debug('Image path is %s', img_path)
-//
-//         if constants.POSITIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
-//             logger.debug('Positive URL hints match. Adding 20.')
-//             img_score += 20
-//
-//         if constants.NEGATIVE_LEAD_IMAGE_URL_HINTS_RE.match(img_path):
-//             logger.debug('Negative URL hints match. Subtracting 20.')
-//             img_score -= 20
-//
-//         # Gifs are more often structure than photos
-//         if img_path.endswith('gif'):
-//             logger.debug('gif found. Subtracting 10.')
-//             img_score -= 10
-//
-//         # JPGs are more often photographs
-//         if img_path.endswith('jpg'):
-//             logger.debug('jpg found. Adding 10.')
-//             img_score += 10
-//
-//         # PNGs are neutral.
-//
-//         # Alt attribute usually means non-presentational image.
-//         if 'alt' in img.attrib and len(img.attrib['alt']) > 5:
-//             logger.debug('alt attribute found. Adding 5.')
-//             img_score += 5
-//
-//         # Look through our parent and grandparent for figure-like
-//         # container elements, give a bonus if we find them
-//         parents = [img.getparent()]
-//         if parents[0] is not None and parents[0].getparent() is not None:
-//             parents.append(parents[0].getparent())
-//         for p in parents:
-//             if p.tag == 'figure':
-//                 logger.debug('Parent with <figure> tag found. Adding 25.')
-//                 img_score += 25
-//
-//             p_sig = ' '.join([p.get('id', ''), p.get('class', '')])
-//             if constants.PHOTO_HINTS_RE.search(p_sig):
-//                 logger.debug('Photo hints regex match. Adding 15.')
-//                 img_score += 15
-//
-//         # Look at our immediate sibling and see if it looks like it's a
-//         # caption. Bonus if so.
-//         sibling = img.getnext()
-//         if sibling is not None:
-//             if sibling.tag == 'figcaption':
-//                 img_score += 25
-//
-//             sib_sig = ' '.join([sibling.get('id', ''),
-//                                 sibling.get('class', '')]).lower()
-//             if 'caption' in sib_sig:
-//                 img_score += 15
-//
-//         # Pull out width/height if they were set.
-//         img_width = None
-//         img_height = None
-//         if 'width' in img.attrib:
-//             try:
-//                 img_width = float(img.get('width'))
-//             except ValueError:
-//                 pass
-//         if 'height' in img.attrib:
-//             try:
-//                 img_height = float(img.get('height'))
-//             except ValueError:
-//                 pass
-//
-//         # Penalty for skinny images
-//         if img_width and img_width <= 50:
-//             logger.debug('Skinny image found. Subtracting 50.')
-//             img_score -= 50
-//
-//         # Penalty for short images
-//         if img_height and img_height <= 50:
-//             # Wide, short images are more common than narrow, tall ones
-//             logger.debug('Short image found. Subtracting 25.')
-//             img_score -= 25
-//
-//         if img_width and img_height and not 'sprite' in img_path:
-//             area = img_width * img_height
-//
-//             if area < 5000: # Smaller than 50x100
-//                 logger.debug('Image with small area found. Subtracting 100.')
-//                 img_score -= 100
-//             else:
-//                 img_score += round(area/1000.0)
-//
-//         # If the image is higher on the page than other images,
-//         # it gets a bonus. Penalty if lower.
-//         logger.debug('Adding page placement bonus of %d.', len(imgs)/2 - i)
-//         img_score += len(imgs)/2 - i
-//
-//         # Use the raw src here because we munged img_path for case
-//         # insensitivity
-//         logger.debug('Final score is %d.', img_score)
-//         img_scores[img.attrib['src']] += img_score
-//
-//     top_score = 0
-//     top_url = None
-//     for (url, score) in img_scores.items():
-//         if score > top_score:
-//             top_url = url
-//             top_score = score
-//
-//     if top_score > 0:
-//         logger.debug('Using top score image from content. Score was %d', top_score)
-//         return top_url
-//
-//
-//     # If nothing else worked, check to see if there are any really
-//     # probable nodes in the doc, like <link rel="image_src" />.
-//     logger.debug('Trying to find lead image in probable nodes')
-//     for selector in constants.LEAD_IMAGE_URL_SELECTORS:
-//         nodes = self.resource.extract_by_selector(selector)
-//         for node in nodes:
-//             clean_value = None
-//             if node.attrib.get('src'):
-//                 clean_value = self.clean(node.attrib['src'])
-//
-//             if not clean_value and node.attrib.get('href'):
-//                 clean_value = self.clean(node.attrib['href'])
-//
-//             if not clean_value and node.attrib.get('value'):
-//                 clean_value = self.clean(node.attrib['value'])
-//
-//             if clean_value:
-//                 logger.debug('Found lead image in probable nodes.')
-//                 logger.debug('Node was: %s', node)
-//                 return clean_value
-//
-//     return None
--- a/src/test-helpers.js
+++ b/src/test-helpers.js
@ -1,8 +1,8 @@
 import assert from 'assert';
 import nock from 'nock'; // eslint-disable-line import/no-extraneous-dependencies
-// import fs from 'fs';
 import path from 'path';
 import cheerio from 'cheerio';
+// import fs from 'fs';	

 export function clean(string) {
  return string
@ -26,7 +26,6 @@ export function record(name, options = {}) {
  let has_fixtures = !!process.env.NOCK_RECORD;

  return {
-    // starts recording, or ensure the fixtures exist
    before: () => {
      if (cheerio.browser) return;
      if (!has_fixtures) {
@ -45,18 +44,19 @@ export function record(name, options = {}) {
        });
      }
    },
-    // saves our recording if fixtures didn't already exist
+
    after: done => {
      if (!has_fixtures && !cheerio.browser) {
        has_fixtures = nock.recorder.play();
        // eslint-disable-next-line no-console
        console.log(
          `This is disabled for browser/node interop. To capture fixutres,
-          open ${'`src/test-helpers.js`'} and comment out lines 55 and 56 and
-          uncomment fs import at top of file.`
+          open ${'`src/test-helpers.js`'} and comment out lines 57 and 58 and
+          uncomment the fs import at top of file.`
        );
        // const text = `const nock = require('nock');\n${has_fixtures.join('\n')}`;
        // fs.writeFile(fp, text, done);
+
      } else {
        done();
      }
--- a/src/utils/dom/brs-to-ps.js
+++ b/src/utils/dom/brs-to-ps.js
@ -20,7 +20,6 @@ export default function brsToPs($) {
      $element.remove();
    } else if (collapsing) {
      collapsing = false;
-      // $(element).replaceWith('<p />')
      paragraphize(element, $, true);
    }
  });
--- a/src/utils/dom/clean-attributes.js
+++ b/src/utils/dom/clean-attributes.js
@ -24,12 +24,6 @@ function removeAllButWhitelist($article, $) {
  return $article;
 }

-// function removeAttrs(article, $) {
-//   REMOVE_ATTRS.forEach((attr) => {
-//     $(`[${attr}]`, article).removeAttr(attr);
-//   });
-// }
-
 // Remove attributes like style or align
 export default function cleanAttributes($article, $) {
  // Grabbing the parent because at this point
--- a/src/utils/dom/convert-node-to.js
+++ b/src/utils/dom/convert-node-to.js
@ -6,7 +6,6 @@ export default function convertNodeTo($node, $, tag = 'p') {
    return $;
  }
  const attrs = getAttrs(node) || {};
-  // console.log(attrs)

  const attribString = Reflect.ownKeys(attrs)
    .map(key => `${key}=${attrs[key]}`)