@ -1,7 +1,27 @@
import re
from collections import namedtuple
from lxml . etree import tounicode
from lxml . html import fragment_fromstring
from breadability . document import OriginalDocument
from breadability . utils import cached_property
# Named container for the regex groups used when classifying nodes:
# ``unlikely`` flags nodes to strip, ``maybe`` rescues likely-content nodes,
# ``positive``/``negative`` are placeholders for content scoring (unused yet).
RegexList = namedtuple('RegexList',
                       ['unlikely', 'maybe', 'positive', 'negative'])

READABLERE = RegexList(
    unlikely=re.compile(
        'combx|comment|community|disqus|extra|foot|header|menu|'
        'remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination'
        '|pager|popup|tweet|twitter', re.I),
    maybe=re.compile('and|article|body|column|main|shadow', re.I),
    positive=(),
    negative=(),
)
def drop_tag(doc, *tags):
    """Remove every element matching ``tags`` (and its subtree) from ``doc``.

    :param doc: lxml document/element to prune in place.
    :param tags: tag names whose occurrences should be dropped.
    :return: the (mutated) ``doc``, so ``doc = drop_tag(doc, ...)`` chains
        work — previously this returned ``None``, nulling the caller's doc.
    """
    for tag in tags:
        # Materialize the matches first: dropping nodes while iterating the
        # live iterfind() result can skip siblings of removed elements.
        for node in list(doc.iterfind('.//' + tag)):
            node.drop_tree()
    return doc
@ -17,10 +37,70 @@ def build_base_document(html):
found_body = html . find ( ' .//body ' )
if found_body is not None :
# remove any CSS and set our own
found_body . set ( ' class ' , ' readabilityBody ' )
found_body . set ( ' id ' , ' readabilityBody ' )
return found_body
def transform_misused_divs_into_paragraphs(doc):
    """Turn all divs that don't contain child divs into <p> elements.

    The idea is that a <div> with no nested <div> is really being used as a
    paragraph, so we rewrite its tag from div to p, keeping its attributes
    and contents.

    :param doc: lxml document/element, mutated in place.
    :return: ``doc`` with the offending divs replaced.
    """
    # Snapshot the matches first — we must not change the tree while
    # iterating it (replacing elements mid-iteration can skip nodes).
    for elem in list(doc.iter(tag='div')):
        if any(child.tag == 'div' for child in elem):
            # Contains another div: leave it as structural markup.
            continue
        # Leaf-ish div: stringify it, then regex-replace the leading and
        # trailing div bits so it parses back as a <p> with the same
        # attributes and children.
        markup = tounicode(elem)
        opened = re.sub(r'^<\s*div', '<p', markup)
        closed = re.sub(r'div>$', 'p>', opened)
        elem.getparent().replace(elem, fragment_fromstring(closed))
    return doc
def process(doc):
    """Process this doc to make it readable.

    Removes nodes whose class/id matches the "unlikely" regex (unless the
    node is <body> or is rescued by the "maybe" regex) and collects the
    <p>/<td>/<pre> nodes that are candidates for content scoring.

    :param doc: lxml document/element, pruned in place.
    """
    unlikely = []
    scorable_node_tags = ('p', 'td', 'pre')
    nodes_to_score = []

    def is_unlikely_node(node, node_id):
        """Short helper for checking unlikely status."""
        # Use search(), not match(): the tokens can appear anywhere in the
        # class/id string, mirroring the JS ``.search(...) !== -1`` this
        # was ported from. match() would only hit at the very start.
        if READABLERE.unlikely.search(node_id):
            if not READABLERE.maybe.search(node_id):
                if node.tag != 'body':
                    return True
        return False

    for n in doc.iter():
        # If the id or class shows up in the unlikely list, mark for removal.
        node_id = '%s %s' % (n.get('class', ''), n.get('id', ''))
        if is_unlikely_node(n, node_id):
            unlikely.append(n)
        if n.tag in scorable_node_tags:
            nodes_to_score.append(n)

    # Process our clean up instructions.
    for n in unlikely:
        n.drop_tree()
    # NOTE(review): nodes_to_score is collected but not used here yet —
    # the scoring half of the algorithm appears unported. Confirm intent.
# def transform_misused_divs_into_paragraphs(self):
# for elem in self.html.iter():
# if elem.tag.lower() == "div":
# # transform <div>s that do not contain other block elements into <p>s
# if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
# self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
# elem.tag = "p"
class Article ( object ) :
""" Parsed readable object """
@ -33,5 +113,305 @@ class Article(object):
doc = self . orig . html
doc = build_base_document ( doc )
doc = drop_tag ( doc , ' script ' , ' link ' , ' style ' , ' noscript ' )
doc = transform_misused_divs_into_paragraphs ( doc )
return doc
"""
Algorithm notes for
/ * * *
* grabArticle - Using a variety of metrics ( content score , classname , element types ) , find the content that is
* most likely to be the stuff a user wants to read . Then return it wrapped up in a div .
*
* @param page a document to run upon . Needs to be a full document , complete with body .
* @return Element
* * /
grabArticle : function ( page ) {
var stripUnlikelyCandidates = readability . flagIsActive ( readability . FLAG_STRIP_UNLIKELYS ) ,
isPaging = ( page != = null ) ? true : false ;
page = page ? page : document . body ;
var pageCacheHtml = page . innerHTML ;
var allElements = page . getElementsByTagName ( ' * ' ) ;
/ * *
* First , node prepping . Trash nodes that look cruddy ( like ones with the class name " comment " , etc ) , and turn divs
* into P tags where they have been used inappropriately ( as in , where they contain no other block level elements . )
*
* Note : Assignment from index for performance . See http : / / www . peachpit . com / articles / article . aspx ? p = 31567 & seqNum = 5
* TODO : Shouldn ' t this be a reverse traversal?
* * /
var node = null ;
var nodesToScore = [ ] ;
for ( var nodeIndex = 0 ; ( node = allElements [ nodeIndex ] ) ; nodeIndex + = 1 ) {
/ * Remove unlikely candidates * /
if ( stripUnlikelyCandidates ) {
var unlikelyMatchString = node . className + node . id ;
if (
(
unlikelyMatchString . search ( readability . regexps . unlikelyCandidates ) != = - 1 & &
unlikelyMatchString . search ( readability . regexps . okMaybeItsACandidate ) == = - 1 & &
node . tagName != = " BODY "
)
)
{
dbg ( " Removing unlikely candidate - " + unlikelyMatchString ) ;
node . parentNode . removeChild ( node ) ;
nodeIndex - = 1 ;
continue ;
}
}
if ( node . tagName == = " P " | | node . tagName == = " TD " | | node . tagName == = " PRE " ) {
nodesToScore [ nodesToScore . length ] = node ;
}
/ * Turn all divs that don ' t have children block level elements into p ' s * /
if ( node . tagName == = " DIV " ) {
if ( node . innerHTML . search ( readability . regexps . divToPElements ) == = - 1 ) {
var newNode = document . createElement ( ' p ' ) ;
try {
newNode . innerHTML = node . innerHTML ;
node . parentNode . replaceChild ( newNode , node ) ;
nodeIndex - = 1 ;
nodesToScore [ nodesToScore . length ] = node ;
}
catch ( e ) {
dbg ( " Could not alter div to p, probably an IE restriction, reverting back to div.: " + e ) ;
}
}
else
{
/ * EXPERIMENTAL * /
for ( var i = 0 , il = node . childNodes . length ; i < il ; i + = 1 ) {
var childNode = node . childNodes [ i ] ;
if ( childNode . nodeType == = 3 ) { / / Node . TEXT_NODE
var p = document . createElement ( ' p ' ) ;
p . innerHTML = childNode . nodeValue ;
p . style . display = ' inline ' ;
p . className = ' readability-styled ' ;
childNode . parentNode . replaceChild ( p , childNode ) ;
}
}
}
}
}
/ * *
* Loop through all paragraphs , and assign a score to them based on how content - y they look .
* Then add their score to their parent node .
*
* A score is determined by things like number of commas , class names , etc . Maybe eventually link density .
* * /
var candidates = [ ] ;
for ( var pt = 0 ; pt < nodesToScore . length ; pt + = 1 ) {
var parentNode = nodesToScore [ pt ] . parentNode ;
var grandParentNode = parentNode ? parentNode . parentNode : null ;
var innerText = readability . getInnerText ( nodesToScore [ pt ] ) ;
if ( ! parentNode | | typeof ( parentNode . tagName ) == = ' undefined ' ) {
continue ;
}
/ * If this paragraph is less than 25 characters , don ' t even count it. */
if ( innerText . length < 25 ) {
continue ; }
/ * Initialize readability data for the parent . * /
if ( typeof parentNode . readability == = ' undefined ' ) {
readability . initializeNode ( parentNode ) ;
candidates . push ( parentNode ) ;
}
/ * Initialize readability data for the grandparent . * /
if ( grandParentNode & & typeof ( grandParentNode . readability ) == = ' undefined ' & & typeof ( grandParentNode . tagName ) != = ' undefined ' ) {
readability . initializeNode ( grandParentNode ) ;
candidates . push ( grandParentNode ) ;
}
var contentScore = 0 ;
/ * Add a point for the paragraph itself as a base . * /
contentScore + = 1 ;
/ * Add points for any commas within this paragraph * /
contentScore + = innerText . split ( ' , ' ) . length ;
/ * For every 100 characters in this paragraph , add another point . Up to 3 points . * /
contentScore + = Math . min ( Math . floor ( innerText . length / 100 ) , 3 ) ;
/ * Add the score to the parent . The grandparent gets half . * /
parentNode . readability . contentScore + = contentScore ;
if ( grandParentNode ) {
grandParentNode . readability . contentScore + = contentScore / 2 ;
}
}
/ * *
* After we ' ve calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score .
* * /
var topCandidate = null ;
for ( var c = 0 , cl = candidates . length ; c < cl ; c + = 1 )
{
/ * *
* Scale the final candidates score based on link density . Good content should have a
* relatively small link density ( 5 % or less ) and be mostly unaffected by this operation .
* * /
candidates [ c ] . readability . contentScore = candidates [ c ] . readability . contentScore * ( 1 - readability . getLinkDensity ( candidates [ c ] ) ) ;
dbg ( ' Candidate: ' + candidates [ c ] + " ( " + candidates [ c ] . className + " : " + candidates [ c ] . id + " ) with score " + candidates [ c ] . readability . contentScore ) ;
if ( ! topCandidate | | candidates [ c ] . readability . contentScore > topCandidate . readability . contentScore ) {
topCandidate = candidates [ c ] ; }
}
/ * *
* If we still have no top candidate , just use the body as a last resort .
* We also have to copy the body node so it is something we can modify .
* * /
if ( topCandidate == = null | | topCandidate . tagName == = " BODY " )
{
topCandidate = document . createElement ( " DIV " ) ;
topCandidate . innerHTML = page . innerHTML ;
page . innerHTML = " " ;
page . appendChild ( topCandidate ) ;
readability . initializeNode ( topCandidate ) ;
}
/ * *
* Now that we have the top candidate , look through its siblings for content that might also be related .
* Things like preambles , content split by ads that we removed , etc .
* * /
var articleContent = document . createElement ( " DIV " ) ;
if ( isPaging ) {
articleContent . id = " readability-content " ;
}
var siblingScoreThreshold = Math . max ( 10 , topCandidate . readability . contentScore * 0.2 ) ;
var siblingNodes = topCandidate . parentNode . childNodes ;
for ( var s = 0 , sl = siblingNodes . length ; s < sl ; s + = 1 ) {
var siblingNode = siblingNodes [ s ] ;
var append = false ;
/ * *
* Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList .
* Example of error visible here : http : / / www . esquire . com / features / honesty0707
* * /
if ( ! siblingNode ) {
continue ;
}
dbg ( " Looking at sibling node: " + siblingNode + " ( " + siblingNode . className + " : " + siblingNode . id + " ) " + ( ( typeof siblingNode . readability != = ' undefined ' ) ? ( " with score " + siblingNode . readability . contentScore ) : ' ' ) ) ;
dbg ( " Sibling has score " + ( siblingNode . readability ? siblingNode . readability . contentScore : ' Unknown ' ) ) ;
if ( siblingNode == = topCandidate )
{
append = true ;
}
var contentBonus = 0 ;
/ * Give a bonus if sibling nodes and the top candidate have the exact same classname * /
if ( siblingNode . className == = topCandidate . className & & topCandidate . className != = " " ) {
contentBonus + = topCandidate . readability . contentScore * 0.2 ;
}
if ( typeof siblingNode . readability != = ' undefined ' & & ( siblingNode . readability . contentScore + contentBonus ) > = siblingScoreThreshold )
{
append = true ;
}
if ( siblingNode . nodeName == = " P " ) {
var linkDensity = readability . getLinkDensity ( siblingNode ) ;
var nodeContent = readability . getInnerText ( siblingNode ) ;
var nodeLength = nodeContent . length ;
if ( nodeLength > 80 & & linkDensity < 0.25 )
{
append = true ;
}
else if ( nodeLength < 80 & & linkDensity == = 0 & & nodeContent . search ( / \. ( | $ ) / ) != = - 1 )
{
append = true ;
}
}
if ( append ) {
dbg ( " Appending node: " + siblingNode ) ;
var nodeToAppend = null ;
if ( siblingNode . nodeName != = " DIV " & & siblingNode . nodeName != = " P " ) {
/ * We have a node that isn ' t a common block level element, like a form or td tag. Turn it into a div so it doesn ' t get filtered out later by accident . * /
dbg ( " Altering siblingNode of " + siblingNode . nodeName + ' to div. ' ) ;
nodeToAppend = document . createElement ( " DIV " ) ;
try {
nodeToAppend . id = siblingNode . id ;
nodeToAppend . innerHTML = siblingNode . innerHTML ;
}
catch ( er ) {
dbg ( " Could not alter siblingNode to div, probably an IE restriction, reverting back to original. " ) ;
nodeToAppend = siblingNode ;
s - = 1 ;
sl - = 1 ;
}
} else {
nodeToAppend = siblingNode ;
s - = 1 ;
sl - = 1 ;
}
/ * To ensure a node does not interfere with readability styles , remove its classnames * /
nodeToAppend . className = " " ;
/ * Append sibling and subtract from our list because it removes the node when you append to another node * /
articleContent . appendChild ( nodeToAppend ) ;
}
}
/ * *
* So we have all of the content that we need . Now we clean it up for presentation .
* * /
readability . prepArticle ( articleContent ) ;
if ( readability . curPageNum == = 1 ) {
articleContent . innerHTML = ' <div id= " readability-page-1 " class= " page " > ' + articleContent . innerHTML + ' </div> ' ;
}
/ * *
* Now that we ' ve gone through the full algorithm, check to see if we got any meaningful content.
* If we didn ' t, we may need to re-run grabArticle with different flags set. This gives us a higher
* likelihood of finding the content , and the sieve approach gives us a higher likelihood of
* finding the - right - content .
* * /
if ( readability . getInnerText ( articleContent , false ) . length < 250 ) {
page . innerHTML = pageCacheHtml ;
if ( readability . flagIsActive ( readability . FLAG_STRIP_UNLIKELYS ) ) {
readability . removeFlag ( readability . FLAG_STRIP_UNLIKELYS ) ;
return readability . grabArticle ( page ) ;
}
else if ( readability . flagIsActive ( readability . FLAG_WEIGHT_CLASSES ) ) {
readability . removeFlag ( readability . FLAG_WEIGHT_CLASSES ) ;
return readability . grabArticle ( page ) ;
}
else if ( readability . flagIsActive ( readability . FLAG_CLEAN_CONDITIONALLY ) ) {
readability . removeFlag ( readability . FLAG_CLEAN_CONDITIONALLY ) ;
return readability . grabArticle ( page ) ;
} else {
return null ;
}
}
return articleContent ;
} ,
/ * *
"""