Merge pull request #40 from leibovic/byline
Improve byline algorithm. r=Gijs
This commit is contained in:
commit
d0df9d8479
@ -434,6 +434,23 @@ Readability.prototype = {
|
|||||||
return node && node.nextElementSibling;
|
return node && node.nextElementSibling;
|
||||||
},
|
},
|
||||||
|
|
||||||
|
_checkByline: function(node, matchString) {
|
||||||
|
if (this._articleByline) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node.getAttribute !== undefined) {
|
||||||
|
var rel = node.getAttribute("rel");
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((rel === "author" || this.REGEXPS.byline.test(matchString)) && this._isValidByline(node.textContent)) {
|
||||||
|
this._articleByline = node.textContent.trim();
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
},
|
||||||
|
|
||||||
/***
|
/***
|
||||||
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
|
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
|
||||||
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
|
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
|
||||||
@ -468,12 +485,11 @@ Readability.prototype = {
|
|||||||
|
|
||||||
while (node) {
|
while (node) {
|
||||||
var matchString = node.className + " " + node.id;
|
var matchString = node.className + " " + node.id;
|
||||||
if (this.REGEXPS.byline.test(matchString) && !this._articleByline) {
|
|
||||||
if (this._isValidByline(node.textContent)) {
|
// Check to see if this node is a byline, and remove it if it is.
|
||||||
this._articleByline = node.textContent.trim();
|
if (this._checkByline(node, matchString)) {
|
||||||
node = this._removeAndGetNext(node);
|
node = this._removeAndGetNext(node);
|
||||||
continue;
|
continue;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Remove unlikely candidates
|
// Remove unlikely candidates
|
||||||
@ -756,19 +772,12 @@ Readability.prototype = {
|
|||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Attempts to get the excerpt from these
|
* Attempts to get excerpt and byline metadata for the article.
|
||||||
* sources in the following order:
|
|
||||||
* - meta description tag
|
|
||||||
* - open-graph description
|
|
||||||
* - twitter cards description
|
|
||||||
* - article's first paragraph
|
|
||||||
* If no excerpt is found, an empty string will be
|
|
||||||
* returned.
|
|
||||||
*
|
*
|
||||||
* @param Element - root element of the processed version page
|
* @return Object with optional "excerpt" and "byline" properties
|
||||||
* @return String - excerpt of the article
|
*/
|
||||||
**/
|
_getArticleMetadata: function() {
|
||||||
_getExcerpt: function(articleContent) {
|
var metadata = {};
|
||||||
var values = {};
|
var values = {};
|
||||||
var metaElements = this._doc.getElementsByTagName("meta");
|
var metaElements = this._doc.getElementsByTagName("meta");
|
||||||
|
|
||||||
@ -785,6 +794,11 @@ Readability.prototype = {
|
|||||||
var elementName = element.getAttribute("name");
|
var elementName = element.getAttribute("name");
|
||||||
var elementProperty = element.getAttribute("property");
|
var elementProperty = element.getAttribute("property");
|
||||||
|
|
||||||
|
if (elementName === "author") {
|
||||||
|
metadata.byline = element.getAttribute("content");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
var name = null;
|
var name = null;
|
||||||
if (namePattern.test(elementName)) {
|
if (namePattern.test(elementName)) {
|
||||||
name = elementName;
|
name = elementName;
|
||||||
@ -804,26 +818,16 @@ Readability.prototype = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if ("description" in values) {
|
if ("description" in values) {
|
||||||
return values["description"];
|
metadata.excerpt = values["description"];
|
||||||
}
|
} else if ("og:description" in values) {
|
||||||
|
|
||||||
if ("og:description" in values) {
|
|
||||||
// Use facebook open graph description.
|
// Use facebook open graph description.
|
||||||
return values["og:description"];
|
metadata.excerpt = values["og:description"];
|
||||||
}
|
} else if ("twitter:description" in values) {
|
||||||
|
|
||||||
if ("twitter:description" in values) {
|
|
||||||
// Use twitter cards description.
|
// Use twitter cards description.
|
||||||
return values["twitter:description"];
|
metadata.excerpt = values["twitter:description"];
|
||||||
}
|
}
|
||||||
|
|
||||||
// No description meta tags, use the article's first paragraph.
|
return metadata;
|
||||||
var paragraphs = articleContent.getElementsByTagName("p");
|
|
||||||
if (paragraphs.length > 0) {
|
|
||||||
return paragraphs[0].textContent;
|
|
||||||
}
|
|
||||||
|
|
||||||
return "";
|
|
||||||
},
|
},
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -1527,6 +1531,8 @@ Readability.prototype = {
|
|||||||
this._prepDocument();
|
this._prepDocument();
|
||||||
|
|
||||||
var articleTitle = this._getArticleTitle();
|
var articleTitle = this._getArticleTitle();
|
||||||
|
var metadata = this._getArticleMetadata();
|
||||||
|
|
||||||
var articleContent = this._grabArticle();
|
var articleContent = this._grabArticle();
|
||||||
if (!articleContent)
|
if (!articleContent)
|
||||||
return null;
|
return null;
|
||||||
@ -1543,14 +1549,22 @@ Readability.prototype = {
|
|||||||
// }).bind(this), 500);
|
// }).bind(this), 500);
|
||||||
// }
|
// }
|
||||||
|
|
||||||
var excerpt = this._getExcerpt(articleContent);
|
// If we haven't found an excerpt in the article's metadata, use the article's
|
||||||
|
// first paragraph as the excerpt. This is used for displaying a preview of
|
||||||
|
// the article's content.
|
||||||
|
if (!metadata.excerpt) {
|
||||||
|
var paragraphs = articleContent.getElementsByTagName("p");
|
||||||
|
if (paragraphs.length > 0) {
|
||||||
|
metadata.excerpt = paragraphs[0].textContent;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return { uri: this._uri,
|
return { uri: this._uri,
|
||||||
title: articleTitle,
|
title: articleTitle,
|
||||||
byline: this._articleByline,
|
byline: metadata.byline || this._articleByline,
|
||||||
dir: this._articleDir,
|
dir: this._articleDir,
|
||||||
content: articleContent.innerHTML,
|
content: articleContent.innerHTML,
|
||||||
length: articleContent.textContent.length,
|
length: articleContent.textContent.length,
|
||||||
excerpt: excerpt };
|
excerpt: metadata.excerpt };
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"title": "This API is so Fetching! ✩ Mozilla Hacks – the Web developer blog",
|
"title": "This API is so Fetching! ✩ Mozilla Hacks – the Web developer blog",
|
||||||
"byline": "blog.nikhilism.com",
|
"byline": "Nikhil Marathe",
|
||||||
"excerpt": "For more than a decade the Web has used XMLHttpRequest (XHR) to achieve asynchronous requests in JavaScript. While very useful, XHR is not a very ..."
|
"excerpt": "For more than a decade the Web has used XMLHttpRequest (XHR) to achieve asynchronous requests in JavaScript. While very useful, XHR is not a very ..."
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
{
|
{
|
||||||
"title": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt",
|
"title": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt",
|
||||||
"byline": null,
|
"byline": "Henri Sivonen",
|
||||||
"excerpt": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam\n voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit\n amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam\n nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed\n diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet."
|
"excerpt": "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy\n eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam\n voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet\n clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit\n amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam\n nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed\n diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat,\n sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum.\n Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor\n sit amet."
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user