release: 1.0.7 (#160)

7 years ago · e267d57d78
parent f13bb721f6
commit e267d57d78
6 changed files with 302 additions and 19 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,22 @@
 # Mercury Parser Changelog

+### 1.0.7 (Mar 15, 2017)
+
+##### Commits
+
+* [[`f13bb721f6`](https://github.com/postlight/mercury-parser/commit/f13bb721f6)] - **feat**: prospect magazine parser (#147) (Janet) 
+* [[`1b28713cf5`](https://github.com/postlight/mercury-parser/commit/1b28713cf5)] - **feat**: fool.com parser (#158) (Kevin Ngao) 
+* [[`c18959779d`](https://github.com/postlight/mercury-parser/commit/c18959779d)] - **feat**: forward.com parser (#144) (Janet) 
+* [[`50e548bac2`](https://github.com/postlight/mercury-parser/commit/50e548bac2)] - **feat**: qdaily parser (#146) (Janet) 
+* [[`51a4d1d12f`](https://github.com/postlight/mercury-parser/commit/51a4d1d12f)] - **feat**: newrepublic parser shows image on page (#159) (Silas Burton) 
+* [[`11382ce651`](https://github.com/postlight/mercury-parser/commit/11382ce651)] - **feat**: Slate extractor (#153) (Silas Burton) 
+* [[`5acaa6ab56`](https://github.com/postlight/mercury-parser/commit/5acaa6ab56)] - **feat**: ici.radio-canada.ca extractor (#156) (Silas Burton) 
+* [[`4509b341e6`](https://github.com/postlight/mercury-parser/commit/4509b341e6)] - **feat**: better cleanup of atlantic articles (#157) (Silas Burton) 
+* [[`f2e3f055c2`](https://github.com/postlight/mercury-parser/commit/f2e3f055c2)] - **fix**: an issue with encoding (#154) (Kevin Ngao) 
+* [[`9b371e51ac`](https://github.com/postlight/mercury-parser/commit/9b371e51ac)] - **feat**: gothamist extractor (#151) (Silas Burton) 
+* [[`afbef9bc39`](https://github.com/postlight/mercury-parser/commit/afbef9bc39)] - **fix**: Encoding on Body (#143) (Kevin Ngao) 
+* [[`9d4c883d51`](https://github.com/postlight/mercury-parser/commit/9d4c883d51)] - **release**: 1.0.6 (#142) (Adam Pash) 
+
 ### 1.0.6 (Feb 9, 2017)

 ##### Commits
--- a/dist/mercury.js
+++ b/dist/mercury.js
@ -2120,7 +2120,7 @@ var TheAtlanticExtractor = {
    // Is there anything that is in the result that shouldn't be?
    // The clean selectors will remove anything that matches from
    // the result
-    clean: ['.partner-box']
+    clean: ['.partner-box', '.callout']
  },

  date_published: {
@ -2904,7 +2904,7 @@ var NewrepublicComExtractor = {
  },

  content: {
-    selectors: ['div.content-body', '.minutes-primary div.content-body'],
+    selectors: [['.article-cover', 'div.content-body'], ['.minute-image', '.minutes-primary div.content-body']],

    // Is there anything in the content you selected that needs transformed
    // before it's consumable content? E.g., unusual lazy loaded images
@ -5309,6 +5309,123 @@ var WwwOpposingviewsComExtractor = {
  }
 };

+var WwwProspectmagazineCoUkExtractor = { // Custom extractor definition for www.prospectmagazine.co.uk
+  domain: 'www.prospectmagazine.co.uk',
+
+  title: {
+    selectors: ['.page-title']
+  },
+
+  author: {
+    selectors: ['.aside_author .title']
+  },
+
+  date_published: {
+    selectors: ['.post-info'],
+
+    timezone: 'Europe/London' // NOTE(review): presumably applied when the matched date text carries no zone of its own — confirm
+  },
+
+  dek: {
+    selectors: ['.page-subtitle']
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // NOTE(review): looks like a [selector, attribute] pair — confirm against the selector engine
+  },
+
+  content: {
+    selectors: [
+    // Disabled alternative selector: ['article.type-post div.post_content p'],
+    'article .post_content'],
+
+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy-loaded images
+    transforms: {},
+
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: []
+  }
+};
+
+var ForwardComExtractor = { // Custom extractor definition for forward.com
+  domain: 'forward.com',
+
+  title: {
+    selectors: [['meta[name="og:title"]', 'value']] // NOTE(review): looks like a [selector, attribute] pair — confirm against the selector engine
+  },
+
+  author: {
+    selectors: ['.author-name', ['meta[name="sailthru.author"]', 'value']] // visible byline class listed first, meta tag second
+  },
+
+  date_published: {
+    selectors: [['meta[name="date"]', 'value']]
+  },
+
+  dek: {
+    selectors: [
+      // TODO: no dek selectors are defined for this site yet
+    ]
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+
+  content: {
+    selectors: [['.post-item-media-wrap', '.post-item p']], // NOTE(review): presumably a multi-selector group (media wrapper plus body paragraphs) — confirm
+
+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy-loaded images
+    transforms: {},
+
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: ['.donate-box', '.message', '.subtitle']
+  }
+};
+
+var WwwQdailyComExtractor = { // Custom extractor definition for www.qdaily.com
+  domain: 'www.qdaily.com',
+
+  title: {
+    selectors: ['h2', 'h2.title'] // NOTE(review): the generic 'h2' is listed before the more specific 'h2.title' — confirm the intended priority
+  },
+
+  author: {
+    selectors: ['.name']
+  },
+
+  date_published: {
+    selectors: [['.date.smart-date', 'data-origindate']] // NOTE(review): looks like a [selector, attribute] pair reading data-origindate — confirm
+  },
+
+  dek: {
+    selectors: ['.excerpt']
+  },
+
+  lead_image_url: {
+    selectors: [['.article-detail-hd img', 'src']]
+  },
+
+  content: {
+    selectors: ['.detail'],
+
+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy-loaded images
+    transforms: {},
+
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: ['.lazyload', '.lazylad', '.lazylood'] // NOTE(review): '.lazylad' and '.lazylood' look like misspellings of '.lazyload' — confirm they match real class names on the site
+  }
+};
+
 var GothamistComExtractor = {
  domain: 'gothamist.com',

@ -5357,6 +5474,127 @@ var GothamistComExtractor = {
  }
 };

+var WwwFoolComExtractor = { // Custom extractor definition for www.fool.com
+  domain: 'www.fool.com',
+
+  title: {
+    selectors: ['h1']
+  },
+
+  author: {
+    selectors: ['.author-inline .author-name']
+  },
+
+  date_published: {
+    selectors: [['meta[name="date"]', 'value']] // NOTE(review): looks like a [selector, attribute] pair — confirm against the selector engine
+  },
+
+  dek: {
+    selectors: ['header h2']
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+
+  content: {
+    selectors: ['.article-content'],
+
+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy-loaded images
+    transforms: {
+      '.caption img': function captionImg($node) { // swaps the image's parent element for a bare <figure><img>
+        var src = $node.attr('src');
+        $node.parent().replaceWith('<figure><img src="' + src + '"/></figure>'); // only the src attribute is carried into the new markup
+      },
+      '.caption': 'figcaption' // NOTE(review): presumably converts matching nodes to <figcaption> — confirm
+    },
+
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: ['#pitch']
+  }
+};
+
+var WwwSlateComExtractor = { // Custom extractor definition for www.slate.com
+  domain: 'www.slate.com',
+
+  title: {
+    selectors: ['.hed', 'h1'] // site-specific headline class listed first, generic h1 second
+  },
+
+  author: {
+    selectors: ['a[rel=author]']
+  },
+
+  date_published: {
+    selectors: ['.pub-date'],
+
+    timezone: 'America/New_York' // NOTE(review): presumably applied when the matched date text carries no zone of its own — confirm
+  },
+
+  dek: {
+    selectors: ['.dek']
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']] // NOTE(review): looks like a [selector, attribute] pair — confirm against the selector engine
+  },
+
+  content: {
+    selectors: ['.body'],
+
+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy-loaded images
+    transforms: {},
+
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: ['.about-the-author', '.pullquote', '.newsletter-signup-component', '.top-comment']
+  }
+};
+
+var IciRadioCanadaCaExtractor = { // Custom extractor definition for ici.radio-canada.ca
+  domain: 'ici.radio-canada.ca',
+
+  title: {
+    selectors: ['h1']
+  },
+
+  author: {
+    selectors: [['meta[name="dc.creator"]', 'value']] // NOTE(review): looks like a [selector, attribute] pair — confirm against the selector engine
+  },
+
+  date_published: {
+    selectors: [['meta[name="dc.date.created"]', 'value']],
+
+    timezone: 'America/New_York' // NOTE(review): presumably applied when the meta value carries no zone of its own — confirm
+  },
+
+  dek: {
+    selectors: ['.bunker-component.lead']
+  },
+
+  lead_image_url: {
+    selectors: [['meta[name="og:image"]', 'value']]
+  },
+
+  content: {
+    selectors: [['.main-multimedia-item', '.news-story-content']], // NOTE(review): presumably a multi-selector group (lead media plus story body) — confirm
+
+    // Is there anything in the content you selected that needs to be transformed
+    // before it's consumable content? E.g., unusual lazy-loaded images
+    transforms: {},
+
+    // Is there anything that is in the result that shouldn't be?
+    // The clean selectors will remove anything that matches from
+    // the result
+    clean: []
+  }
+};
+


 var CustomExtractors = Object.freeze({
@ -5444,7 +5682,13 @@ var CustomExtractors = Object.freeze({
 	WwwLinkedinComExtractor: WwwLinkedinComExtractor,
 	ObamawhitehouseArchivesGovExtractor: ObamawhitehouseArchivesGovExtractor,
 	WwwOpposingviewsComExtractor: WwwOpposingviewsComExtractor,
-	GothamistComExtractor: GothamistComExtractor
+	WwwProspectmagazineCoUkExtractor: WwwProspectmagazineCoUkExtractor,
+	ForwardComExtractor: ForwardComExtractor,
+	WwwQdailyComExtractor: WwwQdailyComExtractor,
+	GothamistComExtractor: GothamistComExtractor,
+	WwwFoolComExtractor: WwwFoolComExtractor,
+	WwwSlateComExtractor: WwwSlateComExtractor,
+	IciRadioCanadaCaExtractor: IciRadioCanadaCaExtractor
 });

 var Extractors = _Object$keys(CustomExtractors).reduce(function (acc, key) {
--- a/dist/mercury.js.map
+++ b/dist/mercury.js.map
--- a/dist/mercury.web.js
+++ b/dist/mercury.web.js
--- a/package.json
+++ b/package.json
@ -1,6 +1,6 @@
 {
  "name": "mercury-parser",
-  "version": "1.0.6",
+  "version": "1.0.7",
  "description": "",
  "repository": "github:postlight/mercury-parser",
  "main": "./dist/mercury.js",
@ -68,7 +68,7 @@
    "ora": "^0.3.0",
    "phantomjs-polyfill-find": "ptim/phantomjs-polyfill-find",
    "phantomjs-polyfill-string-includes": "^1.0.0",
-    "phantomjs-prebuilt": "^2.1.13",
+    "phantomjs-prebuilt": "^2.1.14",
    "requirejs": "^2.3.2",
    "rollup": "^0.36.3",
    "rollup-plugin-babel": "^2.6.1",
--- a/yarn.lock
+++ b/yarn.lock
@ -2248,6 +2248,14 @@ fs-extra@~0.30.0:
    path-is-absolute "^1.0.0"
    rimraf "^2.2.8"

+fs-extra@~1.0.0:
+  version "1.0.0"
+  resolved "https://registry.yarnpkg.com/fs-extra/-/fs-extra-1.0.0.tgz#cd3ce5f7e7cb6145883fcae3191e9877f8587950"
+  dependencies:
+    graceful-fs "^4.1.2"
+    jsonfile "^2.1.0"
+    klaw "^1.0.0"
+
 fs.realpath@^1.0.0:
  version "1.0.0"
  resolved "https://registry.yarnpkg.com/fs.realpath/-/fs.realpath-1.0.0.tgz#1504ad2523158caa40db4a2787cb01411994ea4f"
@ -4097,7 +4105,21 @@ phantomjs-polyfill-string-includes@^1.0.0:
  version "1.0.0"
  resolved "https://registry.yarnpkg.com/phantomjs-polyfill-string-includes/-/phantomjs-polyfill-string-includes-1.0.0.tgz#ea180d4bbc24b8d83e477f8ee8893efabcb29393"

-phantomjs-prebuilt@^2.1.13, phantomjs-prebuilt@^2.1.7:
+phantomjs-prebuilt@^2.1.14:
+  version "2.1.14"
+  resolved "https://registry.yarnpkg.com/phantomjs-prebuilt/-/phantomjs-prebuilt-2.1.14.tgz#d53d311fcfb7d1d08ddb24014558f1188c516da0"
+  dependencies:
+    es6-promise "~4.0.3"
+    extract-zip "~1.5.0"
+    fs-extra "~1.0.0"
+    hasha "~2.2.0"
+    kew "~0.7.0"
+    progress "~1.1.8"
+    request "~2.79.0"
+    request-progress "~2.0.1"
+    which "~1.2.10"
+
+phantomjs-prebuilt@^2.1.7:
  version "2.1.13"
  resolved "https://registry.yarnpkg.com/phantomjs-prebuilt/-/phantomjs-prebuilt-2.1.13.tgz#66556ad9e965d893ca5a7dc9e763df7e8697f76d"
  dependencies:
@ -4423,7 +4445,7 @@ request-promise@^4.1.1:
    request-promise-core "1.1.1"
    stealthy-require "^1.0.0"

-request@^2.55.0, request@^2.72.0, request@^2.75.0:
+request@^2.55.0, request@^2.72.0, request@^2.75.0, request@~2.79.0:
  version "2.79.0"
  resolved "https://registry.yarnpkg.com/request/-/request-2.79.0.tgz#4dfe5bf6be8b8cdc37fcf93e04b65577722710de"
  dependencies: