Add processing of content per the algorithm with some base tests

pull/4/merge
Richard Harding 12 years ago
parent 7960264c3b
commit 8f28e7c947

@@ -1,3 +1,4 @@
import chardet
import re
from lxml.etree import tostring
from lxml.etree import tounicode
@@ -31,6 +32,12 @@ def get_encoding(page):
    return enc
def replace_multi_br_to_paragraphs(html):
    """Convert multiple <br>s into paragraphs"""
    rep = re.compile("(<br[^>]*>[ \n\r\t]*){2,}", re.I)
    return rep.sub('</p><p>', html)
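A quick sanity check of the substitution (illustrative input only): runs of two or more <br> tags become a paragraph break, while a lone <br> is left alone. The dangling </p>/<p> ends are tolerated by lxml's parsing later in the pipeline.

    # illustration: two or more consecutive <br>s collapse into a
    # paragraph boundary; a single <br> survives untouched
    print(replace_multi_br_to_paragraphs("one<br /><br />two<br>three"))
    # -> one</p><p>two<br>three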
def build_doc(page):
    """Requires that the `page` not be None"""
    if page is None:
@@ -65,6 +72,7 @@ class OriginalDocument(object):
    def _parse(self, html):
        """Generate an lxml document from our html."""
        html = replace_multi_br_to_paragraphs(html)
        doc = build_doc(html)
        # doc = html_cleaner.clean_html(doc)
        base_href = self.url

@@ -1,7 +1,27 @@
import re
from collections import namedtuple
from lxml.etree import tounicode
from lxml.html import fragment_fromstring
from breadability.document import OriginalDocument
from breadability.utils import cached_property
RegexList = namedtuple('RegexList',
    ['unlikely', 'maybe', 'positive', 'negative'])
READABLERE = RegexList(
    unlikely=re.compile(
        'combx|comment|community|disqus|extra|foot|header|menu|'
        'remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|'
        'pagination|pager|popup|tweet|twitter', re.I),
    maybe=re.compile('and|article|body|column|main|shadow', re.I),
    positive=(),
    negative=()
)
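As a rough illustration of how the two lists interact (the id/class string here is made up): a node whose combined class+id hits the unlikely pattern is only kept if it also hits the maybe pattern.

    nodeid = 'article-footer'   # hypothetical class+id string
    if READABLERE.unlikely.search(nodeid) and not READABLERE.maybe.search(nodeid):
        print('drop')
    else:
        print('keep')   # 'foot' is unlikely, but 'article' is a maybe, so it stays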
def drop_tag(doc, *tags):
    """Remove every occurrence of the given tags from the document."""
    for tag in tags:
        # materialize the matches first so drop_tree doesn't disturb
        # the iterator, and return the doc so callers can chain on it
        for n in list(doc.iterfind(".//" + tag)):
            n.drop_tree()
    return doc
@@ -17,10 +37,70 @@ def build_base_document(html):
    found_body = html.find('.//body')
    if found_body is not None:
        # remove any CSS and set our own
-        found_body.set('class', 'readabilityBody')
+        found_body.set('id', 'readabilityBody')
        return found_body
def transform_misused_divs_into_paragraphs(doc):
    """Turn all divs that don't have block level children into p's
    Since we can't change the tree as we iterate over it, we must do this
    before we process our document.
    The idea is that we process all divs, and if a div does not contain
    any other divs, we replace it with a <p> tag instead, carrying its
    contents/children over.
    """
    for elem in doc.iter(tag='div'):
        child_tags = [n.tag for n in elem.getchildren()]
        if 'div' not in child_tags:
            # If there is no div inside of this div then it's a leaf
            # node in a sense.
            # We need to create a <p> and put all its contents in there.
            # We'll just stringify it, then regex replace the first/last
            # div bits to turn them into <p> vs <div>.
            orig = tounicode(elem)
            started = re.sub(r'^<\s*div', '<p', orig)
            ended = re.sub(r'div>$', 'p>', started)
            elem.getparent().replace(elem, fragment_fromstring(ended))
    return doc
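A small demonstration under an assumed input (document_fromstring comes from lxml.html; tounicode is imported above): only leaf divs are rewritten, and inline children come along.

    from lxml.html import document_fromstring
    doc = document_fromstring('<html><body><div>hello <b>world</b></div></body></html>')
    print(tounicode(transform_misused_divs_into_paragraphs(doc)))
    # -> <html><body><p>hello <b>world</b></p></body></html>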
def process(doc):
    """Process this doc to make it readable."""
    unlikely = []
    scorable_node_tags = ['p', 'td', 'pre']
    nodes_to_score = []
    def is_unlikely_node(n):
        """Short helper for checking unlikely status."""
        # if the class or id shows up in the unlikely list (and not in
        # the maybe list), mark the node for removal; search, not match,
        # since the terms can appear anywhere in the attribute string
        nodeid = "%s%s" % (n.get('class', ''), n.get('id', ''))
        if READABLERE.unlikely.search(nodeid):
            if not READABLERE.maybe.search(nodeid):
                if n.tag != "body":
                    return True
        return False
    for n in doc.getiterator():
        if is_unlikely_node(n):
            unlikely.append(n)
        if n.tag in scorable_node_tags:
            nodes_to_score.append(n)
    # process our clean up instructions
    for n in unlikely:
        n.drop_tree()
# def transform_misused_divs_into_paragraphs(self):
# for elem in self.html.iter():
# if elem.tag.lower() == "div":
# # transform <div>s that do not contain other block elements into <p>s
# if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
# self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
# elem.tag = "p"
class Article(object):
    """Parsed readable object"""
@@ -33,5 +113,305 @@ class Article(object):
        doc = self.orig.html
        doc = build_base_document(doc)
        doc = drop_tag(doc, 'script', 'link', 'style', 'noscript')
        doc = transform_misused_divs_into_paragraphs(doc)
        return doc
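Putting the pieces together, the intended entry point looks roughly like this (the HTML string is a stand-in; real callers would pass a fetched page, as the tests below do with snippets):

    from breadability.readable import Article
    article = Article('<html><body><p>Some content here.</p></body></html>')
    body = article.readable
    print(body.tag, body.get('id'))
    # -> body readabilityBody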
"""
Algorithm notes for grabArticle, taken from readability.js:
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @param page a document to run upon. Needs to be a full document, complete with body.
* @return Element
**/
grabArticle: function (page) {
var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
isPaging = (page !== null) ? true: false;
page = page ? page : document.body;
var pageCacheHtml = page.innerHTML;
var allElements = page.getElementsByTagName('*');
/**
* First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
* into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
*
* Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
* TODO: Shouldn't this be a reverse traversal?
**/
var node = null;
var nodesToScore = [];
for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
/* Remove unlikely candidates */
if (stripUnlikelyCandidates) {
var unlikelyMatchString = node.className + node.id;
if (
(
unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
node.tagName !== "BODY"
)
)
{
dbg("Removing unlikely candidate - " + unlikelyMatchString);
node.parentNode.removeChild(node);
nodeIndex-=1;
continue;
}
}
if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
nodesToScore[nodesToScore.length] = node;
}
/* Turn all divs that don't have children block level elements into p's */
if (node.tagName === "DIV") {
if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
var newNode = document.createElement('p');
try {
newNode.innerHTML = node.innerHTML;
node.parentNode.replaceChild(newNode, node);
nodeIndex-=1;
nodesToScore[nodesToScore.length] = node;
}
catch(e) {
dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
}
}
else
{
/* EXPERIMENTAL */
for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
var childNode = node.childNodes[i];
if(childNode.nodeType === 3) { // Node.TEXT_NODE
var p = document.createElement('p');
p.innerHTML = childNode.nodeValue;
p.style.display = 'inline';
p.className = 'readability-styled';
childNode.parentNode.replaceChild(p, childNode);
}
}
}
}
}
/**
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [];
for (var pt=0; pt < nodesToScore.length; pt+=1) {
var parentNode = nodesToScore[pt].parentNode;
var grandParentNode = parentNode ? parentNode.parentNode : null;
var innerText = readability.getInnerText(nodesToScore[pt]);
if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
continue;
}
/* If this paragraph is less than 25 characters, don't even count it. */
if(innerText.length < 25) {
continue; }
/* Initialize readability data for the parent. */
if(typeof parentNode.readability === 'undefined') {
readability.initializeNode(parentNode);
candidates.push(parentNode);
}
/* Initialize readability data for the grandparent. */
if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
readability.initializeNode(grandParentNode);
candidates.push(grandParentNode);
}
var contentScore = 0;
/* Add a point for the paragraph itself as a base. */
contentScore+=1;
/* Add points for any commas within this paragraph */
contentScore += innerText.split(',').length;
/* For every 100 characters in this paragraph, add another point. Up to 3 points. */
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
/* Add the score to the parent. The grandparent gets half. */
parentNode.readability.contentScore += contentScore;
if(grandParentNode) {
grandParentNode.readability.contentScore += contentScore/2;
}
}
/**
* After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score.
**/
var topCandidate = null;
for(var c=0, cl=candidates.length; c < cl; c+=1)
{
/**
* Scale the final candidates score based on link density. Good content should have a
* relatively small link density (5% or less) and be mostly unaffected by this operation.
**/
candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
topCandidate = candidates[c]; }
}
/**
* If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify.
**/
if (topCandidate === null || topCandidate.tagName === "BODY")
{
topCandidate = document.createElement("DIV");
topCandidate.innerHTML = page.innerHTML;
page.innerHTML = "";
page.appendChild(topCandidate);
readability.initializeNode(topCandidate);
}
/**
* Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc.
**/
var articleContent = document.createElement("DIV");
if (isPaging) {
articleContent.id = "readability-content";
}
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes = topCandidate.parentNode.childNodes;
for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
var siblingNode = siblingNodes[s];
var append = false;
/**
* Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
* Example of error visible here: http://www.esquire.com/features/honesty0707
**/
if(!siblingNode) {
continue;
}
dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
if(siblingNode === topCandidate)
{
append = true;
}
var contentBonus = 0;
/* Give a bonus if sibling nodes and top candidates have the exact same classname */
if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
contentBonus += topCandidate.readability.contentScore * 0.2;
}
if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
{
append = true;
}
if(siblingNode.nodeName === "P") {
var linkDensity = readability.getLinkDensity(siblingNode);
var nodeContent = readability.getInnerText(siblingNode);
var nodeLength = nodeContent.length;
if(nodeLength > 80 && linkDensity < 0.25)
{
append = true;
}
else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
{
append = true;
}
}
if(append) {
dbg("Appending node: " + siblingNode);
var nodeToAppend = null;
if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
nodeToAppend = document.createElement("DIV");
try {
nodeToAppend.id = siblingNode.id;
nodeToAppend.innerHTML = siblingNode.innerHTML;
}
catch(er) {
dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
nodeToAppend = siblingNode;
s-=1;
sl-=1;
}
} else {
nodeToAppend = siblingNode;
s-=1;
sl-=1;
}
/* To ensure a node does not interfere with readability styles, remove its classnames */
nodeToAppend.className = "";
/* Append sibling and subtract from our list because it removes the node when you append to another node */
articleContent.appendChild(nodeToAppend);
}
}
/**
* So we have all of the content that we need. Now we clean it up for presentation.
**/
readability.prepArticle(articleContent);
if (readability.curPageNum === 1) {
articleContent.innerHTML = '<div id="readability-page-1" class="page">' + articleContent.innerHTML + '</div>';
}
/**
* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
* likelihood of finding the content, and the sieve approach gives us a higher likelihood of
* finding the -right- content.
**/
if(readability.getInnerText(articleContent, false).length < 250) {
page.innerHTML = pageCacheHtml;
if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
return readability.grabArticle(page);
}
else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
return readability.grabArticle(page);
}
else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
return readability.grabArticle(page);
} else {
return null;
}
}
return articleContent;
},
"""

@@ -5,7 +5,7 @@ from breadability.document import OriginalDocument
from breadability.tests import load_snippet
-class TestOriginalDocuemtn(TestCase):
+class TestOriginalDocument(TestCase):
"""Verify we can process html into a document to work off of."""
@@ -37,3 +37,8 @@ class TestOriginalDocuemtn(TestCase):
        self.assertEqual(link_counts['blog'], 2)
        self.assertEqual(link_counts['other'], 1)
    def test_no_br_allowed(self):
        """We convert all <br/> tags to <p> tags"""
        doc = OriginalDocument(load_snippet('document_min.html'))
        self.assertIsNone(doc.html.find('.//br'))

@@ -1,11 +1,13 @@
from collections import defaultdict
from lxml.etree import tounicode
from lxml.html import document_fromstring
from unittest import TestCase
from breadability.readable import Article
from breadability.readable import transform_misused_divs_into_paragraphs
from breadability.tests import load_snippet
-class TestOriginalDocument(TestCase):
+class TestReadableDocument(TestCase):
"""Verify we can process html into a document to work off of."""
def test_load_doc(self):
@@ -30,7 +32,7 @@ class TestOriginalDocument(TestCase):
"""
doc = Article(load_snippet('document_min.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.get('class'), 'readabilityBody')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
    def test_body_doesnt_exist(self):
        """If we can't find a body, then we create one.
@@ -40,7 +42,7 @@ class TestOriginalDocument(TestCase):
"""
doc = Article(load_snippet('document_no_body.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.get('class'), 'readabilityBody')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
    def test_bare_content(self):
        """If the document is just pure content, no html tags, we should be ok
@@ -50,4 +52,62 @@ class TestOriginalDocument(TestCase):
"""
doc = Article(load_snippet('document_only_content.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.get('class'), 'readabilityBody')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
class TestCleaning(TestCase):
    """Test out our cleaning processing we do."""
    def test_unlikely_hits(self):
        """Verify we wipe out things from our unlikely list."""
        doc = Article(load_snippet('test_readable_unlikely.html'))
        readable = doc.readable
        must_not_appear = ['comment', 'community', 'disqus', 'extra', 'foot',
            'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
            'sponsor', 'ad-break', 'agegate', 'pagination', 'pager',
            'popup', 'tweet', 'twitter']
        want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']
        for i in must_not_appear:
            # we cannot find any class or id with this value
            by_class = readable.find_class(i)
            for test in by_class:
                # if it's here it must also carry one of the
                # want_to_appear classes to have survived the cleaning
                found = False
                for cls in test.get('class').split():
                    if cls in want_to_appear:
                        found = True
                self.assertTrue(found)
            by_ids = readable.get_element_by_id(i, False)
            if by_ids is not False:
                found = False
                for ids in by_ids.get('id').split():
                    if ids in want_to_appear:
                        found = True
                self.assertTrue(found)
    def test_misused_divs_transform(self):
        """Verify we replace leaf node divs with p's
        They should have the same content, just be a p vs a div
        """
        test_html = "<html><body><div>simple</div></body></html>"
        test_doc = document_fromstring(test_html)
        self.assertEqual(
            tounicode(
                transform_misused_divs_into_paragraphs(test_doc)),
            u"<html><body><p>simple</p></body></html>"
        )
        test_html2 = '<html><body><div>simple<a href="">link</a></div></body></html>'
        test_doc2 = document_fromstring(test_html2)
        self.assertEqual(
            tounicode(
                transform_misused_divs_into_paragraphs(test_doc2)),
            u'<html><body><p>simple<a href="">link</a></p></body></html>'
        )

@@ -4,5 +4,11 @@
</head>
<body>
<h1>Min Document</h1>
<p>Testing content</p>
<br /><br />
<div>More content.</div>
<br><br>
<div>Additional content.</div>
<div>Final content.</div>
</body>
</html>

@@ -0,0 +1,6 @@
<html>
<head>
<title>Bad Document Title</title>
</head>
<h1>Bad Document</h1>
</html>

@@ -0,0 +1,2 @@
<h1>Bad Document</h1>
<p>Some bad content in a document without proper html</p>

@@ -0,0 +1,27 @@
<html>
<head>
<title>Min Document Title</title>
</head>
<body>
<h1>Min Document</h1>
<p>Testing content</p>
<!-- This is all stuff that should disappear -->
<div class="comment">Gone</div>
<div id="disqus">Gone</div>
<p id="foot">Gone</div>
<p id="header">Gone</div>
<p class="header">Gone</div>
<div id="header">Gone</div>
<div id="header">Gone</div>
<!-- These have bad and good terms so should stay -->
<p id="mainfoot">Gone</div>
<p id="harticleeader">Gone</div>
<p class="article header">Gone</div>
<p class="column header">Gone</div>
<!-- And this will stick around for final -->
<div>Final content.</div>
</body>
</html>