From 2d5f59f3eb347c5f753cdc8828adcf16aaf8189b Mon Sep 17 00:00:00 2001 From: Nicolas Perriault Date: Mon, 23 Mar 2015 10:07:04 +0100 Subject: [PATCH] Fixes #56 - Updated support for embedded Youtube & Vimeo videos. --- Readability.js | 15 ++++--- .../embedded-videos/expected-metadata.json | 5 +++ test/test-pages/embedded-videos/expected.html | 36 ++++++++++++++++ test/test-pages/embedded-videos/source.html | 43 +++++++++++++++++++ test/test-readability.js | 2 +- 5 files changed, 93 insertions(+), 8 deletions(-) create mode 100644 test/test-pages/embedded-videos/expected-metadata.json create mode 100644 test/test-pages/embedded-videos/expected.html create mode 100644 test/test-pages/embedded-videos/source.html diff --git a/Readability.js b/Readability.js index d532977..fc834ad 100644 --- a/Readability.js +++ b/Readability.js @@ -96,7 +96,7 @@ Readability.prototype = { byline: /byline|author|dateline|writtenby/i, replaceFonts: /<(\/?)font[^>]*>/gi, normalize: /\s{2,}/g, - videos: /https?:\/\/(www\.)?(youtube|vimeo)\.com/i, + videos: /https?:\/\/(www\.)?(youtube|youtube-nocookie|player\.vimeo)\.com/i, nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i, prevLink: /(prev|earl|old|new|<|«)/i, whitespace: /^\s*$/, @@ -366,7 +366,9 @@ Readability.prototype = { var imgCount = paragraph.getElementsByTagName('img').length; var embedCount = paragraph.getElementsByTagName('embed').length; var objectCount = paragraph.getElementsByTagName('object').length; - var totalCount = imgCount + embedCount + objectCount; + // At this point, unwanted iframes have already been removed; only embedded video iframes remain. 
+ var iframeCount = paragraph.getElementsByTagName('iframe').length; + var totalCount = imgCount + embedCount + objectCount + iframeCount; if (totalCount === 0 && !this._getInnerText(paragraph, false)) paragraph.parentNode.removeChild(paragraph); @@ -1412,15 +1414,14 @@ Readability.prototype = { * @return void **/ _clean: function(e, tag) { - var isEmbed = (tag === 'object' || tag === 'embed'); + var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1; this._forEachNode(e.getElementsByTagName(tag), function(element) { // Allow youtube and vimeo videos through as people usually want to see those. if (isEmbed) { - var attributeValues = ""; - for (var i = 0, il = element.attributes.length; i < il; i += 1) { - attributeValues += element.attributes[i].value + '|'; - } + var attributeValues = [].map.call(element.attributes, function(attr) { + return attr.value; + }).join("|"); // First, check the elements attributes to see if any of them contain youtube or vimeo if (this.REGEXPS.videos.test(attributeValues)) diff --git a/test/test-pages/embedded-videos/expected-metadata.json b/test/test-pages/embedded-videos/expected-metadata.json new file mode 100644 index 0000000..8663ff4 --- /dev/null +++ b/test/test-pages/embedded-videos/expected-metadata.json @@ -0,0 +1,5 @@ +{ + "title": "Embedded videos test", + "byline": null, + "excerpt": "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod\n tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,\n quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo\n consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse\n cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non\n proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
+} diff --git a/test/test-pages/embedded-videos/expected.html b/test/test-pages/embedded-videos/expected.html new file mode 100644 index 0000000..b37810a --- /dev/null +++ b/test/test-pages/embedded-videos/expected.html @@ -0,0 +1,36 @@ +
+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat + non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Videos

+ +

At root

+ + + +

In a paragraph

+

+ +

+

In a div

+

+ +

+

Foo

+ +

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat + non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+
\ No newline at end of file diff --git a/test/test-pages/embedded-videos/source.html b/test/test-pages/embedded-videos/source.html new file mode 100644 index 0000000..ee0a9d0 --- /dev/null +++ b/test/test-pages/embedded-videos/source.html @@ -0,0 +1,43 @@ + + + + + Embedded videos test + + +
+

Lorem

+
+

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod + tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+
+

Videos

+

At root

+ + + +

In a paragraph

+

+

In a div

+
+

Foo

+
+ Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, + quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo + consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse + cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non + proident, sunt in culpa qui officia deserunt mollit anim id est laborum. +
+
+ + diff --git a/test/test-readability.js b/test/test-readability.js index 5425834..1141abb 100644 --- a/test/test-readability.js +++ b/test/test-readability.js @@ -63,7 +63,7 @@ describe("Test page", function() { pathBase: "http://fakehost/test" }; - beforeEach(function() { + before(function() { doc = new JSDOMParser().parse(source); result = new Readability(uri, doc).parse(); });