Improve metadata extraction (#478)

* Improve metadata extraction

* Recognize meta[property] as a space-separated list
* Recognize Dublin Core (dc: and dcterm: prefixed) metadata.
* Prefer Dublin Core, Open Graph, plain HTML, and Twitter metadata, in that order.
* _getArticleTitle() is now only used as a fallback if the document's
 metadata doesn't provide a title.
pull/483/head
Daniel Aleksandersen 6 years ago committed by Gijs
parent 0449dbf186
commit 5a69d4a8eb

@ -1198,62 +1198,66 @@ Readability.prototype = {
var values = {};
var metaElements = this._doc.getElementsByTagName("meta");
// Match "description", or Twitter's "twitter:description" (Cards)
// in name attribute.
var namePattern = /^\s*((twitter)\s*:\s*)?(description|title)\s*$/i;
// property is a space-separated list of values
var propertyPattern = /\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title)\s*/gi;
// Match Facebook's Open Graph title & description properties.
var propertyPattern = /^\s*og\s*:\s*(description|title)\s*$/i;
// name is a single value
var namePattern = /^\s*(?:(dc|dcterm|og|twitter)\s*[\.:]\s*)?(author|creator|description|title)\s*$/i;
// Find description tags.
this._forEachNode(metaElements, function(element) {
var elementName = element.getAttribute("name");
var elementProperty = element.getAttribute("property");
var content = element.getAttribute("content");
var matches = null;
var name = null;
if ([elementName, elementProperty].indexOf("author") !== -1) {
metadata.byline = element.getAttribute("content");
return;
if (elementProperty) {
matches = elementProperty.match(propertyPattern);
if (matches) {
for (var i = matches.length - 1; i >= 0; i--) {
// Convert to lowercase, and remove any whitespace
// so we can match below.
name = matches[i].toLowerCase().replace(/\s/g, "");
// multiple authors
values[name] = content.trim();
}
}
}
var name = null;
if (namePattern.test(elementName)) {
if (!matches && elementName && namePattern.test(elementName)) {
name = elementName;
} else if (propertyPattern.test(elementProperty)) {
name = elementProperty;
}
if (name) {
var content = element.getAttribute("content");
if (content) {
// Convert to lowercase and remove any whitespace
// so we can match below.
name = name.toLowerCase().replace(/\s/g, "");
// Convert to lowercase, remove any whitespace, and convert dots
// to colons so we can match below.
name = name.toLowerCase().replace(/\s/g, "").replace(/\./g, ":");
values[name] = content.trim();
}
}
});
if ("description" in values) {
metadata.excerpt = values["description"];
} else if ("og:description" in values) {
// Use facebook open graph description.
metadata.excerpt = values["og:description"];
} else if ("twitter:description" in values) {
// Use twitter cards description.
metadata.excerpt = values["twitter:description"];
}
// get title
metadata.title = values["dc:title"] ||
values["dcterm:title"] ||
values["og:title"] ||
values["title"] ||
values["twitter:title"];
metadata.title = this._getArticleTitle();
if (!metadata.title) {
if ("og:title" in values) {
// Use facebook open graph title.
metadata.title = values["og:title"];
} else if ("twitter:title" in values) {
// Use twitter cards title.
metadata.title = values["twitter:title"];
}
metadata.title = this._getArticleTitle();
}
// get author
metadata.byline = values["dc:creator"] ||
values["dcterm:creator"] ||
values["author"];
// get description
metadata.excerpt = values["dc:description"] ||
values["dcterm:description"] ||
values["og:description"] ||
values["description"] ||
values["twitter:description"];
return metadata;
},

@ -1,5 +1,5 @@
{
"title": "This API is so Fetching! ✩ Mozilla Hacks the Web developer blog",
"title": "This API is so Fetching!",
"byline": "Nikhil Marathe",
"excerpt": "For more than a decade the Web has used XMLHttpRequest (XHR) to achieve asynchronous requests in JavaScript. While very useful, XHR is not a very ...",
"readerable": true

@ -0,0 +1,7 @@
{
"title": "Dublin Core property title",
"byline": "Dublin Core property author",
"dir": null,
"excerpt": "Dublin Core property description",
"readerable": true
}

@ -0,0 +1,20 @@
<div id="readability-page-1" class="page">
<article>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
</article>
</div>

@ -0,0 +1,45 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>Title Element</title>
<meta name="title" content="Meta name title"/>
<meta name="og:title" content="Open Graph name title"/>
<meta name="twitter:title" content="Twitter name title"/>
<meta name="DC.title" content="Dublin Core name title"/>
<meta property="dc:title" content="Dublin Core property title"/>
<meta property="twitter:title" content="Twitter property title"/>
<meta property="og:title" content="Open Graph property title"/>
<meta name="author" content="Meta name author"/>
<meta name="DC.creator" content="Dublin Core name author"/>
<meta property="dc:creator" content="Dublin Core property author"/>
<meta name="description" content="Meta name description"/>
<meta name="og:description" content="Open Graph name description"/>
<meta name="twitter:description" content="Twitter name description"/>
<meta name="DC.description" content="Dublin Core name description"/>
<meta property="dc:description" content="Dublin Core property description"/>
<meta property="twitter:description" content="Twitter property description"/>
<meta property="og:description" content="Open Graph property description"/>
</head>
<body>
<article>
<h1>Test document title</h1>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
</article>
</body>
</html>

@ -0,0 +1,7 @@
{
"title": "Preferred title",
"byline": "Creator Name",
"dir": null,
"excerpt": "Preferred description",
"readerable": true
}

@ -0,0 +1,20 @@
<div id="readability-page-1" class="page">
<article>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
</article>
</div>

@ -0,0 +1,35 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>Title Element</title>
<meta property="x:title dc:title" content="Preferred title"/>
<meta property="og:title twitter:title" content="A title"/>
<meta property="dc:creator twitter:site_name" content="Creator Name"/>
<meta name="author" content="FAIL"/>
<meta property="og:description x:description twitter:description" content="A description"/>
<meta property="dc:description og:description" content="Preferred description"/>
<meta name="description" content="FAIL"/>
</head>
<body>
<article>
<h1>Test document title</h1>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam,
quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo
consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse
cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non
proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
</article>
</body>
</html>

@ -1,6 +1,6 @@
{
"title": "Obama admits US gun laws are his 'biggest frustration'",
"title": "Obama admits US gun laws are his 'biggest frustration' - BBC News",
"byline": null,
"excerpt": "President Barack Obama tells the BBC his failure to pass",
"excerpt": "President Barack Obama tells the BBC his failure to pass \"common sense gun safety laws\" is the greatest frustration of his presidency.",
"readerable": true
}

@ -1,5 +1,5 @@
{
"title": "'Neutral' Snopes Fact-Checker David Emery: 'Are There Any Un-Angry Trump Supporters?'",
"title": "'Neutral' Snopes Fact-Checker David Emery: 'Are There Any Un-Angry Trump Supporters?' - Breitbart",
"byline": "by Lucas Nolan22 Dec 2016651",
"dir": "ltr",
"excerpt": "Snopes fact checker and staff writer David Emery posted to Twitter asking if there were “any un-angry Trump supporters?”",

@ -1,5 +1,5 @@
{
"title": "The seven secrets that hotel owners don't want you to know",
"title": "Seven secrets that hotel owners don't want you to know",
"byline": "Hazel Sheffield",
"dir": null,
"excerpt": "Most people go to hotels for the pleasure of sleeping in a giant bed with clean white sheets and waking up to fresh towels in the morning. But those towels and sheets might not be as clean as they look, according to the hotel bosses that responded to an online thread about the things hotel owners dont want you to know.",

@ -1,6 +1,6 @@
{
"title": "Student Dies After Diet Pills She Bought Online \"Burned Her Up From Within\"",
"byline": "Mark Di Stefano",
"excerpt": "An inquest into Eloise Parry's death has been adjourned until July...",
"byline": null,
"excerpt": "An inquest into Eloise Parry's death has been adjourned until July.",
"readerable": true
}

@ -1,7 +1,7 @@
{
"title": "How to Build a Terrarium (with Pictures)",
"title": "How to Build a Terrarium | eHow",
"byline": "Lucy Akins",
"dir": null,
"excerpt": "How to Build a Terrarium. Glass cloche terrariums are not only appealing to the eye, but they also preserve a bit of nature in your home and serve as a simple, yet beautiful, piece of art. Closed terrariums are easy to care for, as they retain much of their own moisture and provide a warm environment with a consistent level of humidity. You...",
"excerpt": "Glass cloche terrariums are not only appealing to the eye, but they also preserve a bit of nature in your home and serve as a simple, yet beautiful, piece of art. Closed terrariums are easy to care for, as they retain much of their own moisture and provide a warm environment with a consistent level of humidity. You wont have to water the...",
"readerable": true
}

@ -1,7 +1,7 @@
{
"title": "How to Throw a Graduation Party on a Budget (with Pictures)",
"title": "How to Throw a Graduation Party on a Budget | eHow",
"byline": "Gina Roberts-Grey",
"dir": null,
"excerpt": "How to Throw a Graduation Party on a Budget. Graduation parties are a great way to commemorate the years of hard work teens and college co-eds devote to education. Theyre also costly for mom and dad.The average cost of a graduation party in 2013 was a whopping $1,200, according to Graduationparty.com; $700 of that was allocated for food....",
"excerpt": "Graduation parties are a great way to commemorate the years of hard work teens and college co-eds devote to education. Theyre also costly for mom and dad.The average cost of a graduation party in 2013 was a whopping $1,200, according to Graduationparty.com; $700 of that was allocated for food. However that budget was based on Midwestern...",
"readerable": true
}

@ -1,7 +1,7 @@
{
"title": "Xbox One X review: A console that keeps up with gaming PCs",
"title": "Xbox One X review: A console that keeps up with gaming PCs",
"byline": null,
"dir": null,
"excerpt": "The Xbox One X is the ultimate video game system. It sports more horsepower than any system ever. And it plays more titles in native 4K than Sony's PlayStation...",
"excerpt": "The Xbox One X is the most powerful gaming console ever, but it's not for everyone yet.",
"readerable": true
}

@ -1,6 +1,6 @@
{
"title": "1Password für Mac generiert Einmal-Passwörter",
"byline": null,
"byline": "Mac & i",
"excerpt": "Das in der iOS-Version bereits enthaltene TOTP-Feature ist nun auch für OS X 10.10 verfügbar. Zudem gibt es neue Zusatzfelder in der Datenbank und weitere Verbesserungen.",
"readerable": true
}

@ -1,5 +1,5 @@
{
"title": "Angry media wont buckle over new surveillance laws\n\t\t\t\t\t\t| Herald Sun",
"title": "Angry media wont buckle over new surveillance laws",
"byline": "JOE HILDEBRAND",
"dir": null,
"excerpt": "A HIGH-powered federal government team has been doing the rounds of media organisations in the past few days in an attempt to allay concerns about the impact of new surveillance legislation on press freedom. It failed.",

@ -1,5 +1,5 @@
{
"title": "Getting LEAN with Digital Ad UX",
"title": "Getting LEAN with Digital Ad UX | IAB",
"byline": "By\n\t\t\tScott Cunningham",
"excerpt": "We messed up. As technologists, tasked with delivering content and services to users, we lost track of the user experience. Twenty years ago we saw an explosion of websites, built by developers around the world, providing all forms of content. This was the beginning of an age of enlightenment, the intersection of content and technology. … Continued",
"readerable": true

@ -1,5 +1,5 @@
{
"title": "draft-dejong-remotestorage-04 - remoteStorage",
"byline": "AUTHORING",
"title": "remoteStorage",
"byline": "Jong, Michiel de",
"readerable": true
}

@ -1,5 +1,5 @@
{
"title": "Inside the Deep Web Drug Lab — Backchannel — Medium",
"title": "Inside the Deep Web Drug Lab",
"byline": "Joseph Cox",
"excerpt": "Welcome to DoctorXs Barcelona lab, where the drugs you bought online are tested for safety and purity. No questions ask…",
"readerable": true

@ -1,6 +1,6 @@
{
"title": "Una solución no violenta para la cuestión mapuche - 07.12.2017",
"title": "Una solución no violenta para la cuestión mapuche",
"byline": null,
"excerpt": "Una solución no violenta para la cuestión mapuche | Los pueblos indígenas reclaman por derechos que permanecen incumplidos, por eso es más eficiente canalizar la protesta que reprimirla - LA NACION",
"excerpt": "Los pueblos indígenas reclaman por derechos que permanecen incumplidos, por eso es más eficiente canalizar la protesta que reprimirla",
"readerable": true
}

@ -1,5 +1,5 @@
{
"title": "Better Student Journalism — Medium",
"title": "The Open Journalism Project: Better Student Journalism",
"byline": "Pippin Lee",
"excerpt": "We pushed out the first version of the Open Journalism site in January. Heres what weve learned about student journali…",
"readerable": true

@ -1,5 +1,5 @@
{
"title": "On Behalf of “Literally” — Medium",
"title": "On Behalf of “Literally”",
"byline": "Courtney Kirchoff",
"excerpt": "In defense of the word “literally” and why you or someone you know should stop misusing the word, lest they drive us fig…",
"readerable": true

@ -1,7 +1,7 @@
{
"title": "Samantha and The Great Big Lie John C. Welch Medium",
"title": "Samantha and The Great Big Lie",
"byline": "John C. Welch",
"dir": null,
"excerpt": "(EDIT: removed the link to Samanthas post, because the arments and the grubers and the rest of The Deck Clique got what they wanted: a non-proper person driven off the internet lightly capped with a…",
"excerpt": "How to get shanked doing what people say they want",
"readerable": true
}

@ -2216,10 +2216,10 @@
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Samantha and The Great Big Lie Medium</title>
<link rel="canonical" href="https://medium.com/@johncwelch/samantha-and-the-great-big-lie-d146a92473a1" />
<meta name="title" content="Samantha and The Great Big Lie John C. Welch Medium" />
<meta name="title" content="Samantha and The Great Big Lie" />
<meta name="referrer" content="always" />
<meta name="description" content="(EDIT: removed the link to Samanthas post, because the arments and the grubers and the rest of The Deck Clique got what they wanted: a non-proper person driven off the internet lightly capped with a…" />
<meta property="og:title" content="Samantha and The Great Big Lie John C. Welch Medium" />
<meta property="og:title" content="Samantha and The Great Big Lie" />
<meta property="og:url" content="https://medium.com/@johncwelch/samantha-and-the-great-big-lie-d146a92473a1#.h9kzgon9m" />
<meta property="og:image" content="https://cdn-images-1.medium.com/max/1200/1*kbPh7V97eyRodSOw2-ALDw.png" />
<meta property="fb:app_id" content="542599432471018" />

@ -1,5 +1,5 @@
{
"title": "Firefox — Customize and make it your own — The most flexible browser on\n the Web — Mozilla",
"title": "Firefox — Customize and make it your own — The most flexible browser on the Web",
"byline": null,
"dir": "ltr",
"excerpt": "Its easier than ever to personalize Firefox and make it work the way\n you do.\n No other browser gives you so much choice and flexibility.",

@ -1,5 +1,5 @@
{
"title": "The sharing economy is a lie: Uber, Ayn Rand and the truth about tech\n and libertarians",
"title": "The sharing economy is a lie: Uber, Ayn Rand and the truth about tech and libertarians",
"byline": "Joanna Rothkopf",
"excerpt": "Disruptive companies talk a good game about sharing. Uber's really just an under-regulated company making riches",
"readerable": true

@ -1,6 +1,6 @@
{
"title": "Raspberry Pi 3 - The credit card sized PC that cost only $35 - All-time bestselling computer in UK",
"byline": null,
"excerpt": "The Raspberry Pi Foundation started by a handful of volunteers in 2012 when they released the original Raspberry Pi 256MB Model B without knowing what to expect. In a short four-year period they have grown to over sixty full-time employees and ha...",
"excerpt": "The Raspberry Pi Foundation started by a handful of volunteers in 2012 when they released the original Raspberry Pi 256MB Model B without knowing what to expect. In a short four-year period they have grown to over sixty full-time employees and ha...",
"readerable": true
}

@ -1,5 +1,5 @@
{
"title": "Minecraft 1.8 - The Bountiful Update - Minecraft 1.8 - The Bountiful Update",
"title": "Minecraft 1.8 - The Bountiful Update",
"byline": null,
"dir": null,
"excerpt": "+ Added Granite, Andesite, and Diorite stone blocks, with smooth versions\n+ Added Slime Block\n+ Added Iron Trapdoor\n+ Added Prismarine and Sea Lantern blocks\n+ Added the Ocean Monument\n+ Added Red...",

@ -1,5 +1,5 @@
{
"title": "Stack Overflow Jobs Data Shows ReactJS Skills in High Demand, WordPress Market Oversaturated with Developers WordPress Tavern",
"title": "Stack Overflow Jobs Data Shows ReactJS Skills in High Demand, WordPress Market Oversaturated with Developers",
"byline": null,
"dir": null,
"excerpt": "Stack Overflow published its analysis of 2017 hiring trends based on the targeting options employers selected when posting to Stack Overflow Jobs. The report, which compares data from 200 companies…",

@ -1,5 +1,5 @@
{
"title": "Russia: Space ship malfunctions, breaks up over Siberia",
"title": "Yahoo News - Latest News & Headlines",
"byline": "NATALIYA VASILYEVA",
"excerpt": "The latest news and headlines from Yahoo! News. Get breaking news stories and in-depth coverage with videos and photos.",
"readerable": true

@ -2,6 +2,6 @@
"title": "Veteran Wraps Baby in American Flag, Photo Sparks Controversy",
"byline": "By GILLIAN MOHNEY\n March 11, 2015 3:46 PM",
"dir": "ltr",
"excerpt": "From Yahoo: A photographer and Navy veteran is fighting back after a photo she posted to Facebook started an online backlash. Vanessa Hicks said she had no idea her photo would be considered controversial. The photo, from a military familys newborn photo shoot, showed a newborn infant wrapped in an American flag held by his father, who was in his military uniform. Hicks, a Navy veteran herself and the wife of an active-duty Navy member, said her intention was to honor the flag as well as her clients, who wanted to incorporate their military service in the photo shoot.",
"excerpt": "A photographer and Navy veteran is fighting back after a photo she posted to Facebook started an online backlash. Vanessa Hicks said she had no idea her photo would be considered controversial. The photo, from a military familys newborn photo shoot, showed a newborn infant wrapped in an American flag held by his father, who was in his military uniform. Hicks, a Navy veteran herself and the wife of an active-duty Navy member, said her intention was to honor the flag as well as her clients, who wanted to incorporate their military service in the photo shoot.",
"readerable": true
}

@ -1,7 +1,7 @@
{
"title": "トレンドマイクロ、公衆無線LANを安全に使うためのアプリ「フリーWi-Fiプロテクション」 CNET Japan - Yahoo!ニュース",
"title": "トレンドマイクロ、公衆無線LANを安全に使うためのアプリ「フリーWi-Fiプロテクション」CNET Japan - Yahoo!ニュース",
"byline": null,
"dir": null,
"excerpt": "トレンドマイクロは3月9日、Wi-Fi利用時の通信を暗号化し保護するスマホ・タブレッ",
"excerpt": "トレンドマイクロは3月9日、Wi-Fi利用時の通信を暗号化し保護するスマホ・タブレット - Yahoo!ニュース(CNET Japan)",
"readerable": true
}

Loading…
Cancel
Save