Add processing of content per the algorithm with some base tests

pull/4/merge
Richard Harding 12 years ago
parent 7960264c3b
commit 8f28e7c947

@@ -1,3 +1,4 @@
import chardet
import re
from lxml.etree import tostring
from lxml.etree import tounicode
@@ -31,6 +32,12 @@ def get_encoding(page):
    return enc
def replace_multi_br_to_paragraphs(html):
    """Convert multiple <br>s into paragraphs"""
    rep = re.compile("(<br[^>]*>[ \n\r\t]*){2,}", re.I)
    return rep.sub('</p><p>', html)
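A quick sanity check of the substitution (illustrative input only): runs of two or more <br> tags become a paragraph break, while a lone <br> is left alone. The dangling </p>/<p> ends are tolerated by lxml's parsing later in the pipeline.

    # illustration: two or more consecutive <br>s collapse into a
    # paragraph boundary; a single <br> survives untouched
    print(replace_multi_br_to_paragraphs("one<br /><br />two<br>three"))
    # -> one</p><p>two<br>three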
def build_doc(page):
    """Requires that the `page` not be None"""
    if page is None:
@@ -65,6 +72,7 @@ class OriginalDocument(object):
    def _parse(self, html):
        """Generate an lxml document from our html."""
        html = replace_multi_br_to_paragraphs(html)
        doc = build_doc(html)
        # doc = html_cleaner.clean_html(doc)
        base_href = self.url

@@ -1,7 +1,27 @@
import re
from collections import namedtuple
from lxml.etree import tounicode
from lxml.html import fragment_fromstring
from breadability.document import OriginalDocument
from breadability.utils import cached_property
RegexList = namedtuple('RegexList',
    ['unlikely', 'maybe', 'positive', 'negative'])
READABLERE = RegexList(
    unlikely=re.compile(
        'combx|comment|community|disqus|extra|foot|header|menu|'
        'remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|'
        'pagination|pager|popup|tweet|twitter', re.I),
    maybe=re.compile('and|article|body|column|main|shadow', re.I),
    positive=(),
    negative=()
)
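As a rough illustration of how the two lists interact (the id/class string here is made up): a node whose combined class+id hits the unlikely pattern is only kept if it also hits the maybe pattern.

    nodeid = 'article-footer'   # hypothetical class+id string
    if READABLERE.unlikely.search(nodeid) and not READABLERE.maybe.search(nodeid):
        print('drop')
    else:
        print('keep')   # 'foot' is unlikely, but 'article' is a maybe, so it stays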
def drop_tag(doc, *tags):
    """Remove every occurrence of the given tags from the document."""
    for tag in tags:
        # materialize the matches first so drop_tree doesn't disturb
        # the iterator, and return the doc so callers can chain on it
        for n in list(doc.iterfind(".//" + tag)):
            n.drop_tree()
    return doc
@@ -17,10 +37,70 @@ def build_base_document(html):
    found_body = html.find('.//body')
    if found_body is not None:
        # remove any CSS and set our own
-        found_body.set('class', 'readabilityBody')
+        found_body.set('id', 'readabilityBody')
        return found_body
def transform_misused_divs_into_paragraphs(doc):
    """Turn all divs that don't have block level children into p's
    Since we can't change the tree as we iterate over it, we must do this
    before we process our document.
    The idea is that we process all divs, and if a div does not contain
    any other divs, we replace it with a <p> tag instead, carrying its
    contents/children over.
    """
    for elem in doc.iter(tag='div'):
        child_tags = [n.tag for n in elem.getchildren()]
        if 'div' not in child_tags:
            # If there is no div inside of this div then it's a leaf
            # node in a sense.
            # We need to create a <p> and put all its contents in there.
            # We'll just stringify it, then regex replace the first/last
            # div bits to turn them into <p> vs <div>.
            orig = tounicode(elem)
            started = re.sub(r'^<\s*div', '<p', orig)
            ended = re.sub(r'div>$', 'p>', started)
            elem.getparent().replace(elem, fragment_fromstring(ended))
    return doc
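A small demonstration under an assumed input (document_fromstring comes from lxml.html; tounicode is imported above): only leaf divs are rewritten, and inline children come along.

    from lxml.html import document_fromstring
    doc = document_fromstring('<html><body><div>hello <b>world</b></div></body></html>')
    print(tounicode(transform_misused_divs_into_paragraphs(doc)))
    # -> <html><body><p>hello <b>world</b></p></body></html>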
def process(doc):
    """Process this doc to make it readable."""
    unlikely = []
    scorable_node_tags = ['p', 'td', 'pre']
    nodes_to_score = []
    def is_unlikely_node(n):
        """Short helper for checking unlikely status."""
        # if the class or id shows up in the unlikely list (and not in
        # the maybe list), mark the node for removal; search, not match,
        # since the terms can appear anywhere in the attribute string
        nodeid = "%s%s" % (n.get('class', ''), n.get('id', ''))
        if READABLERE.unlikely.search(nodeid):
            if not READABLERE.maybe.search(nodeid):
                if n.tag != "body":
                    return True
        return False
    for n in doc.getiterator():
        if is_unlikely_node(n):
            unlikely.append(n)
        if n.tag in scorable_node_tags:
            nodes_to_score.append(n)
    # process our clean up instructions
    for n in unlikely:
        n.drop_tree()
# def transform_misused_divs_into_paragraphs(self):
# for elem in self.html.iter():
# if elem.tag.lower() == "div":
# # transform <div>s that do not contain other block elements into <p>s
# if not REGEXES['divToPElementsRe'].search(unicode(''.join(map(tostring, list(elem))))):
# self.debug("Altering div(#%s.%s) to p" % (elem.get('id', ''), elem.get('class', '')))
# elem.tag = "p"
class Article(object):
    """Parsed readable object"""
@@ -33,5 +113,305 @@ class Article(object):
        doc = self.orig.html
        doc = build_base_document(doc)
        doc = drop_tag(doc, 'script', 'link', 'style', 'noscript')
        doc = transform_misused_divs_into_paragraphs(doc)
        return doc
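Putting the pieces together, the intended entry point looks roughly like this (the HTML string is a stand-in; real callers would pass a fetched page, as the tests below do with snippets):

    from breadability.readable import Article
    article = Article('<html><body><p>Some content here.</p></body></html>')
    body = article.readable
    print(body.tag, body.get('id'))
    # -> body readabilityBody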
"""
Algorithm notes for grabArticle, taken from readability.js:
/***
* grabArticle - Using a variety of metrics (content score, classname, element types), find the content that is
* most likely to be the stuff a user wants to read. Then return it wrapped up in a div.
*
* @param page a document to run upon. Needs to be a full document, complete with body.
* @return Element
**/
grabArticle: function (page) {
var stripUnlikelyCandidates = readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS),
isPaging = (page !== null) ? true: false;
page = page ? page : document.body;
var pageCacheHtml = page.innerHTML;
var allElements = page.getElementsByTagName('*');
/**
* First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc), and turn divs
* into P tags where they have been used inappropriately (as in, where they contain no other block level elements.)
*
* Note: Assignment from index for performance. See http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5
* TODO: Shouldn't this be a reverse traversal?
**/
var node = null;
var nodesToScore = [];
for(var nodeIndex = 0; (node = allElements[nodeIndex]); nodeIndex+=1) {
/* Remove unlikely candidates */
if (stripUnlikelyCandidates) {
var unlikelyMatchString = node.className + node.id;
if (
(
unlikelyMatchString.search(readability.regexps.unlikelyCandidates) !== -1 &&
unlikelyMatchString.search(readability.regexps.okMaybeItsACandidate) === -1 &&
node.tagName !== "BODY"
)
)
{
dbg("Removing unlikely candidate - " + unlikelyMatchString);
node.parentNode.removeChild(node);
nodeIndex-=1;
continue;
}
}
if (node.tagName === "P" || node.tagName === "TD" || node.tagName === "PRE") {
nodesToScore[nodesToScore.length] = node;
}
/* Turn all divs that don't have children block level elements into p's */
if (node.tagName === "DIV") {
if (node.innerHTML.search(readability.regexps.divToPElements) === -1) {
var newNode = document.createElement('p');
try {
newNode.innerHTML = node.innerHTML;
node.parentNode.replaceChild(newNode, node);
nodeIndex-=1;
nodesToScore[nodesToScore.length] = node;
}
catch(e) {
dbg("Could not alter div to p, probably an IE restriction, reverting back to div.: " + e);
}
}
else
{
/* EXPERIMENTAL */
for(var i = 0, il = node.childNodes.length; i < il; i+=1) {
var childNode = node.childNodes[i];
if(childNode.nodeType === 3) { // Node.TEXT_NODE
var p = document.createElement('p');
p.innerHTML = childNode.nodeValue;
p.style.display = 'inline';
p.className = 'readability-styled';
childNode.parentNode.replaceChild(p, childNode);
}
}
}
}
}
/**
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
*
* A score is determined by things like number of commas, class names, etc. Maybe eventually link density.
**/
var candidates = [];
for (var pt=0; pt < nodesToScore.length; pt+=1) {
var parentNode = nodesToScore[pt].parentNode;
var grandParentNode = parentNode ? parentNode.parentNode : null;
var innerText = readability.getInnerText(nodesToScore[pt]);
if(!parentNode || typeof(parentNode.tagName) === 'undefined') {
continue;
}
/* If this paragraph is less than 25 characters, don't even count it. */
if(innerText.length < 25) {
continue; }
/* Initialize readability data for the parent. */
if(typeof parentNode.readability === 'undefined') {
readability.initializeNode(parentNode);
candidates.push(parentNode);
}
/* Initialize readability data for the grandparent. */
if(grandParentNode && typeof(grandParentNode.readability) === 'undefined' && typeof(grandParentNode.tagName) !== 'undefined') {
readability.initializeNode(grandParentNode);
candidates.push(grandParentNode);
}
var contentScore = 0;
/* Add a point for the paragraph itself as a base. */
contentScore+=1;
/* Add points for any commas within this paragraph */
contentScore += innerText.split(',').length;
/* For every 100 characters in this paragraph, add another point. Up to 3 points. */
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
/* Add the score to the parent. The grandparent gets half. */
parentNode.readability.contentScore += contentScore;
if(grandParentNode) {
grandParentNode.readability.contentScore += contentScore/2;
}
}
/**
* After we've calculated scores, loop through all of the possible candidate nodes we found
* and find the one with the highest score.
**/
var topCandidate = null;
for(var c=0, cl=candidates.length; c < cl; c+=1)
{
/**
* Scale the final candidates score based on link density. Good content should have a
* relatively small link density (5% or less) and be mostly unaffected by this operation.
**/
candidates[c].readability.contentScore = candidates[c].readability.contentScore * (1-readability.getLinkDensity(candidates[c]));
dbg('Candidate: ' + candidates[c] + " (" + candidates[c].className + ":" + candidates[c].id + ") with score " + candidates[c].readability.contentScore);
if(!topCandidate || candidates[c].readability.contentScore > topCandidate.readability.contentScore) {
topCandidate = candidates[c]; }
}
/**
* If we still have no top candidate, just use the body as a last resort.
* We also have to copy the body node so it is something we can modify.
**/
if (topCandidate === null || topCandidate.tagName === "BODY")
{
topCandidate = document.createElement("DIV");
topCandidate.innerHTML = page.innerHTML;
page.innerHTML = "";
page.appendChild(topCandidate);
readability.initializeNode(topCandidate);
}
/**
* Now that we have the top candidate, look through its siblings for content that might also be related.
* Things like preambles, content split by ads that we removed, etc.
**/
var articleContent = document.createElement("DIV");
if (isPaging) {
articleContent.id = "readability-content";
}
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
var siblingNodes = topCandidate.parentNode.childNodes;
for(var s=0, sl=siblingNodes.length; s < sl; s+=1) {
var siblingNode = siblingNodes[s];
var append = false;
/**
* Fix for odd IE7 Crash where siblingNode does not exist even though this should be a live nodeList.
* Example of error visible here: http://www.esquire.com/features/honesty0707
**/
if(!siblingNode) {
continue;
}
dbg("Looking at sibling node: " + siblingNode + " (" + siblingNode.className + ":" + siblingNode.id + ")" + ((typeof siblingNode.readability !== 'undefined') ? (" with score " + siblingNode.readability.contentScore) : ''));
dbg("Sibling has score " + (siblingNode.readability ? siblingNode.readability.contentScore : 'Unknown'));
if(siblingNode === topCandidate)
{
append = true;
}
var contentBonus = 0;
/* Give a bonus if sibling nodes and top candidates have the exact same classname */
if(siblingNode.className === topCandidate.className && topCandidate.className !== "") {
contentBonus += topCandidate.readability.contentScore * 0.2;
}
if(typeof siblingNode.readability !== 'undefined' && (siblingNode.readability.contentScore+contentBonus) >= siblingScoreThreshold)
{
append = true;
}
if(siblingNode.nodeName === "P") {
var linkDensity = readability.getLinkDensity(siblingNode);
var nodeContent = readability.getInnerText(siblingNode);
var nodeLength = nodeContent.length;
if(nodeLength > 80 && linkDensity < 0.25)
{
append = true;
}
else if(nodeLength < 80 && linkDensity === 0 && nodeContent.search(/\.( |$)/) !== -1)
{
append = true;
}
}
if(append) {
dbg("Appending node: " + siblingNode);
var nodeToAppend = null;
if(siblingNode.nodeName !== "DIV" && siblingNode.nodeName !== "P") {
/* We have a node that isn't a common block level element, like a form or td tag. Turn it into a div so it doesn't get filtered out later by accident. */
dbg("Altering siblingNode of " + siblingNode.nodeName + ' to div.');
nodeToAppend = document.createElement("DIV");
try {
nodeToAppend.id = siblingNode.id;
nodeToAppend.innerHTML = siblingNode.innerHTML;
}
catch(er) {
dbg("Could not alter siblingNode to div, probably an IE restriction, reverting back to original.");
nodeToAppend = siblingNode;
s-=1;
sl-=1;
}
} else {
nodeToAppend = siblingNode;
s-=1;
sl-=1;
}
/* To ensure a node does not interfere with readability styles, remove its classnames */
nodeToAppend.className = "";
/* Append sibling and subtract from our list because it removes the node when you append to another node */
articleContent.appendChild(nodeToAppend);
}
}
/**
* So we have all of the content that we need. Now we clean it up for presentation.
**/
readability.prepArticle(articleContent);
if (readability.curPageNum === 1) {
articleContent.innerHTML = '<div id="readability-page-1" class="page">' + articleContent.innerHTML + '</div>';
}
/**
* Now that we've gone through the full algorithm, check to see if we got any meaningful content.
* If we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
* likelihood of finding the content, and the sieve approach gives us a higher likelihood of
* finding the -right- content.
**/
if(readability.getInnerText(articleContent, false).length < 250) {
page.innerHTML = pageCacheHtml;
if (readability.flagIsActive(readability.FLAG_STRIP_UNLIKELYS)) {
readability.removeFlag(readability.FLAG_STRIP_UNLIKELYS);
return readability.grabArticle(page);
}
else if (readability.flagIsActive(readability.FLAG_WEIGHT_CLASSES)) {
readability.removeFlag(readability.FLAG_WEIGHT_CLASSES);
return readability.grabArticle(page);
}
else if (readability.flagIsActive(readability.FLAG_CLEAN_CONDITIONALLY)) {
readability.removeFlag(readability.FLAG_CLEAN_CONDITIONALLY);
return readability.grabArticle(page);
} else {
return null;
}
}
return articleContent;
},
"""

@@ -5,7 +5,7 @@ from breadability.document import OriginalDocument
from breadability.tests import load_snippet
-class TestOriginalDocuemtn(TestCase):
+class TestOriginalDocument(TestCase):
"""Verify we can process html into a document to work off of."""
@@ -37,3 +37,8 @@ class TestOriginalDocuemtn(TestCase):
        self.assertEqual(link_counts['blog'], 2)
        self.assertEqual(link_counts['other'], 1)
    def test_no_br_allowed(self):
        """We convert all <br/> tags to <p> tags"""
        doc = OriginalDocument(load_snippet('document_min.html'))
        self.assertIsNone(doc.html.find('.//br'))

@@ -1,11 +1,13 @@
from collections import defaultdict
from lxml.etree import tounicode
from lxml.html import document_fromstring
from unittest import TestCase
from breadability.readable import Article
from breadability.readable import transform_misused_divs_into_paragraphs
from breadability.tests import load_snippet
-class TestOriginalDocument(TestCase):
+class TestReadableDocument(TestCase):
"""Verify we can process html into a document to work off of."""
def test_load_doc(self):
@@ -30,7 +32,7 @@ class TestOriginalDocument(TestCase):
"""
doc = Article(load_snippet('document_min.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.get('class'), 'readabilityBody')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
    def test_body_doesnt_exist(self):
        """If we can't find a body, then we create one.
@@ -40,7 +42,7 @@ class TestOriginalDocument(TestCase):
"""
doc = Article(load_snippet('document_no_body.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.get('class'), 'readabilityBody')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
    def test_bare_content(self):
        """If the document is just pure content, no html tags, we should be ok
@@ -50,4 +52,62 @@ class TestOriginalDocument(TestCase):
"""
doc = Article(load_snippet('document_only_content.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.get('class'), 'readabilityBody')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
class TestCleaning(TestCase):
    """Test out our cleaning processing we do."""
    def test_unlikely_hits(self):
        """Verify we wipe out things from our unlikely list."""
        doc = Article(load_snippet('test_readable_unlikely.html'))
        readable = doc.readable
        must_not_appear = ['comment', 'community', 'disqus', 'extra', 'foot',
            'header', 'menu', 'remark', 'rss', 'shoutbox', 'sidebar',
            'sponsor', 'ad-break', 'agegate', 'pagination', 'pager',
            'popup', 'tweet', 'twitter']
        want_to_appear = ['and', 'article', 'body', 'column', 'main', 'shadow']
        for i in must_not_appear:
            # we cannot find any class or id with this value
            by_class = readable.find_class(i)
            for test in by_class:
                # if it's here it must also carry one of the
                # want_to_appear classes to have survived the cleaning
                found = False
                for cls in test.get('class').split():
                    if cls in want_to_appear:
                        found = True
                self.assertTrue(found)
            by_ids = readable.get_element_by_id(i, False)
            if by_ids is not False:
                found = False
                for ids in by_ids.get('id').split():
                    if ids in want_to_appear:
                        found = True
                self.assertTrue(found)
    def test_misused_divs_transform(self):
        """Verify we replace leaf node divs with p's
        They should have the same content, just be a p vs a div
        """
        test_html = "<html><body><div>simple</div></body></html>"
        test_doc = document_fromstring(test_html)
        self.assertEqual(
            tounicode(
                transform_misused_divs_into_paragraphs(test_doc)),
            u"<html><body><p>simple</p></body></html>"
        )
        test_html2 = '<html><body><div>simple<a href="">link</a></div></body></html>'
        test_doc2 = document_fromstring(test_html2)
        self.assertEqual(
            tounicode(
                transform_misused_divs_into_paragraphs(test_doc2)),
            u'<html><body><p>simple<a href="">link</a></p></body></html>'
        )

@@ -4,5 +4,11 @@
</head>
<body>
<h1>Min Document</h1>
<p>Testing content</p>
<br /><br />
<div>More content.</div>
<br><br>
<div>Additional content.</div>
<div>Final content.</div>
</body>
</html>

@@ -0,0 +1,6 @@
<html>
<head>
<title>Bad Document Title</title>
</head>
<h1>Bad Document</h1>
</html>

@@ -0,0 +1,2 @@
<h1>Bad Document</h1>
<p>Some bad content in a document without proper html</p>

@@ -0,0 +1,27 @@
<html>
<head>
<title>Min Document Title</title>
</head>
<body>
<h1>Min Document</h1>
<p>Testing content</p>
<!-- This is all stuff that should disappear -->
<div class="comment">Gone</div>
<div id="disqus">Gone</div>
<p id="foot">Gone</div>
<p id="header">Gone</div>
<p class="header">Gone</div>
<div id="header">Gone</div>
<div id="header">Gone</div>
<!-- These have bad and good terms so should stay -->
<p id="mainfoot">Gone</div>
<p id="harticleeader">Gone</div>
<p class="article header">Gone</div>
<p class="column header">Gone</div>
<!-- And this will stick around for final -->
<div>Final content.</div>
</body>
</html>