From 19f2124c82eeeefc82f89ce8f68a60f73d5b4a1f Mon Sep 17 00:00:00 2001 From: Evan Tseng Date: Wed, 25 Jan 2017 16:45:12 +0800 Subject: [PATCH] Bug 1259763 - Remove h2 when there is only one h2 and its text content substantially equals article title, r=Gijs --- Readability.js | 21 +- test/test-pages/base-url/expected.html | 1 + test/test-pages/cnn/expected-metadata.json | 7 + test/test-pages/cnn/expected.html | 56 + test/test-pages/cnn/source.html | 4190 +++++++++++++++++ test/test-pages/medium-1/expected.html | 1 + .../test-pages/normalize-spaces/expected.html | 1 + .../replace-font-tags/expected.html | 1 + .../style-tags-removal/expected.html | 1 + test/test-pages/tmz-1/expected.html | 1 + 10 files changed, 4274 insertions(+), 6 deletions(-) create mode 100644 test/test-pages/cnn/expected-metadata.json create mode 100644 test/test-pages/cnn/expected.html create mode 100644 test/test-pages/cnn/source.html diff --git a/Readability.js b/Readability.js index 94055c8..5eff2cf 100644 --- a/Readability.js +++ b/Readability.js @@ -32,6 +32,7 @@ function Readability(uri, doc, options) { this._uri = uri; this._doc = doc; this._biggestFrame = false; + this._articleTitle = null; this._articleByline = null; this._articleDir = null; @@ -482,10 +483,18 @@ Readability.prototype = { this._cleanMatchedNodes(topCandidate, /share/); }); - // If there is only one h2, they are probably using it as a header - // and not a subheader, so remove it since we already have a header. - if (articleContent.getElementsByTagName('h2').length === 1) - this._clean(articleContent, "h2"); + // If there is only one h2 and its text content substantially equals article title, + // they are probably using it as a header and not a subheader, + // so remove it since we already extract the title separately. + var h2 = articleContent.getElementsByTagName('h2'); + if (h2.length === 1) { + var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length; + if (Math.abs(lengthSimilarRate) < 0.5 && + (lengthSimilarRate > 0 ? h2[0].textContent.includes(this._articleTitle) : + this._articleTitle.includes(h2[0].textContent))) { + this._clean(articleContent, "h2"); + } + } this._clean(articleContent, "iframe"); this._clean(articleContent, "input"); @@ -1920,7 +1929,7 @@ Readability.prototype = { this._prepDocument(); var metadata = this._getArticleMetadata(); - var articleTitle = metadata.title; + this._articleTitle = metadata.title; var articleContent = this._grabArticle(); if (!articleContent) @@ -1951,7 +1960,7 @@ Readability.prototype = { var textContent = articleContent.textContent; return { uri: this._uri, - title: articleTitle, + title: this._articleTitle, byline: metadata.byline || this._articleByline, dir: this._articleDir, content: articleContent.innerHTML, diff --git a/test/test-pages/base-url/expected.html b/test/test-pages/base-url/expected.html index c06f1fd..a865368 100644 --- a/test/test-pages/base-url/expected.html +++ b/test/test-pages/base-url/expected.html @@ -16,6 +16,7 @@

+

Foo

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

diff --git a/test/test-pages/cnn/expected-metadata.json b/test/test-pages/cnn/expected-metadata.json new file mode 100644 index 0000000..723989b --- /dev/null +++ b/test/test-pages/cnn/expected-metadata.json @@ -0,0 +1,7 @@ +{ + "title": "The 'birth lottery' and economic mobility", + "byline": "Ahiza Garcia", + "dir": null, + "excerpt": "A recently-released report on poverty and inequality found that the U.S. ranks the lowest among countries with welfare states.", + "readerable": true +} diff --git a/test/test-pages/cnn/expected.html b/test/test-pages/cnn/expected.html new file mode 100644 index 0000000..d2a4433 --- /dev/null +++ b/test/test-pages/cnn/expected.html @@ -0,0 +1,56 @@ +
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+

The U.S. has long been heralded as a land of opportunity -- a place where anyone can succeed regardless of the economic class they were born into.

+

But a new report released on Monday by Stanford University's Center on Poverty and Inequality calls that into question.

+

The report assessed poverty levels, income and wealth inequality, economic mobility and unemployment levels among 10 wealthy countries with social welfare programs.

+
+
+
+
+
+

+ Powered by SmartAsset.com +

+ +
+
+
+
+
+

Among its key findings: the class you're born into matters much more in the U.S. than many of the other countries.

+

As the report states: "[T]he birth lottery matters more in the U.S. than in most well-off countries."

+

But this wasn't the only finding that suggests the U.S. isn't quite living up to its reputation as a country where everyone has an equal chance to get ahead through sheer will and hard work.

+

Related: Rich are paying more in taxes but not as much as they used to

+

The report also suggested the U.S. might not be the "jobs machine" it thinks it is, when compared to other countries.

+

It ranked near the bottom of the pack based on the levels of unemployment among men and women of prime working age. The study determined this by taking the ratio of employed men and women between the ages of 25 and 54 compared to the total population of each country.

+

The overall rankings of the countries were as follows:
1. Finland
2. Norway
3. Australia
4. Canada
5. Germany
6. France
7. United Kingdom
8. Italy
9. Spain
10. United States
+
+
+
+
+
+
+
+
+

+

The low ranking the U.S. received was due to its extreme levels of wealth and income inequality and the ineffectiveness of its "safety net" -- social programs aimed at reducing poverty.

+

Related: Chicago is America's most segregated city

+

The report concluded that the American safety net was ineffective because it provides only half the financial help people need. Additionally, the levels of assistance in the U.S. are generally lower than in other countries.

+ +
+
diff --git a/test/test-pages/cnn/source.html b/test/test-pages/cnn/source.html new file mode 100644 index 0000000..a4950a1 --- /dev/null +++ b/test/test-pages/cnn/source.html @@ -0,0 +1,4190 @@ + + + + + + + + + + + + + + + + + + + + + + + The 'birth lottery' and economic mobility - Feb. 1, 2016 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ +
+ + + + + + + + + + + + +
+
+
+
+
+
+
+
+
+ + + + + +
+
+
+ +
+
+
+
+
+
+ +

The 'birth lottery' and economic mobility

+ +
+
+
+
+ +
+
+
+
+
+
+ +
+
+ +
+
+
+
+
+
+ +
+
+
+
The priest saving LA's gang members
+
Your video will play in 00:25
+
+
+
+ +

The U.S. has long been heralded as a land of opportunity -- a place where anyone can succeed regardless of the economic class they were born into.

+

But a new report released on Monday by Stanford University's Center on Poverty and Inequality calls that into question.

+
+ +
+

The report assessed poverty levels, income and wealth inequality, economic mobility and unemployment levels among 10 wealthy countries with social welfare programs.

+
+
+
+
+
+ + + + + +
+ Powered by SmartAsset.com +
+ + + + + + + + + +
+
+
+
+
+

Among its key findings: the class you're born into matters much more in the U.S. than many of the other countries.

+

As the report states: "[T]he birth lottery matters more in the U.S. than in most well-off countries."

+ +

But this wasn't the only finding that suggests the U.S. isn't quite living up to its reputation as a country where everyone has an equal chance to get ahead through sheer will and hard work.

+

Related: Rich are paying more in taxes but not as much as they used to

+
+
+
ADVERTISING
+
+ +
+
+

The report also suggested the U.S. might not be the "jobs machine" it thinks it is, when compared to other countries.

+

It ranked near the bottom of the pack based on the levels of unemployment among men and women of prime working age. The study determined this by taking the ratio of employed men and women between the ages of 25 and 54 compared to the total population of each country.

+

The overall rankings of the countries were as follows:
1. Finland
2. Norway
3. Australia
4. Canada
5. Germany
6. France
7. United Kingdom
8. Italy
9. Spain
10. United States
+
+
+
+
+
+
+
+
+

+

The low ranking the U.S. received was due to its extreme levels of wealth and income inequality and the ineffectiveness of its "safety net" -- social programs aimed at reducing poverty.

+

Related: Chicago is America's most segregated city

+

The report concluded that the American safety net was ineffective because it provides only half the financial help people need. Additionally, the levels of assistance in the U.S. are generally lower than in other countries.

+
+
+ +
+ +
+
+
+
+
+
+ +
+
+
+ + +
+
+ +
+
+ +
+
+ + + + + +
+ +

Social Surge - What's Trending

+
+
+ +
+ +
+
+ +
+
+ + +
+

Mortgage & Savings + +

+
+ +
+ + + +
+
+ Terms & Conditions apply +

NMLS #1136

+
+
+
+
+
+ +
+

Search for Jobs + +

+ +
+
+
+
+ +
+
+
+
+

LendingTree + +

+
+ +
+
+
+ + +
+

Newsletter

+ + +
+ + +
+
+

CNNMoney Sponsors

+
+
    +
  • + +
  • +
  • + +
  • +
  • + +
  • +
  • + +
  • +
  • + +
  • +
+
+
+ + + + + +
+

Partner Offers + +

+
+
    + + +
+
+
+ + +
+
+ +
+
+
+
+
+ + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+
+
+
+
+
+
+
+
+
+
+
+
+ + + + + + + +
+ +
+
+ + +
+ + + + + +
+ + + + +
+
+
+

+
+
+
+
+ +
+
+ + +
+ + + \ No newline at end of file diff --git a/test/test-pages/medium-1/expected.html b/test/test-pages/medium-1/expected.html index 7a642e5..8cbe509 100644 --- a/test/test-pages/medium-1/expected.html +++ b/test/test-pages/medium-1/expected.html @@ -1,5 +1,6 @@
+

Open Journalism Project:

Better Student Journalism


We pushed out the first version of the Open Journalism site in January. Our goal is for the site to be a place to teach students what they should know about journalism on the web. It should be fun too.

diff --git a/test/test-pages/normalize-spaces/expected.html b/test/test-pages/normalize-spaces/expected.html index 66175cf..cf3db51 100644 --- a/test/test-pages/normalize-spaces/expected.html +++ b/test/test-pages/normalize-spaces/expected.html @@ -1,6 +1,7 @@

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tab here incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Foo

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

\ No newline at end of file diff --git a/test/test-pages/replace-font-tags/expected.html b/test/test-pages/replace-font-tags/expected.html index 565a446..0f243aa 100644 --- a/test/test-pages/replace-font-tags/expected.html +++ b/test/test-pages/replace-font-tags/expected.html @@ -1,6 +1,7 @@

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Foo

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

\ No newline at end of file diff --git a/test/test-pages/style-tags-removal/expected.html b/test/test-pages/style-tags-removal/expected.html index 6335f30..25d1771 100644 --- a/test/test-pages/style-tags-removal/expected.html +++ b/test/test-pages/style-tags-removal/expected.html @@ -1,6 +1,7 @@

Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

+

Foo

Tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

\ No newline at end of file diff --git a/test/test-pages/tmz-1/expected.html b/test/test-pages/tmz-1/expected.html index bc7242e..56a9446 100644 --- a/test/test-pages/tmz-1/expected.html +++ b/test/test-pages/tmz-1/expected.html @@ -1,6 +1,7 @@

+

Lupita Nyong'o

$150K Pearl Oscar Dress ... STOLEN!!!!

2/26/2015 7:11 AM PST BY TMZ STAFF