breadability/breadability/scoring.py

# -*- coding: utf8 -*-

"""Handle dealing with scoring nodes and content for our parsing."""

from __future__ import absolute_import

import re
import logging

from hashlib import md5
from lxml.etree import tounicode

# Compiled, case-insensitive regular expressions that are matched against a
# node's ``class`` and ``id`` attribute values to help decide whether the
# node is a potential content candidate. They are applied with ``search``
# (see check_node_attr), so each alternative matches as a substring anywhere
# in the attribute value, not as a whole word.

# Names hinting that a node is page chrome rather than article content
# (used by is_unlikely_node).
CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|'
    'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
    'pager|perma|popup|tweet|twitter'), re.I)
# Names that override an "unlikely" match (used by is_unlikely_node).
CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.I)
# Names that raise a node's class/id weight (used by get_class_weight).
CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|'
    'page|pagination|post|text|blog|story'), re.I)
# Names that lower a node's class/id weight (used by get_class_weight).
CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|'
    'footnote|head|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|'
    'sidebar|sponsor|shopping|tags|tool|widget'), re.I)

logger = logging.getLogger("breadability")


def check_node_attr(node, attr, checkset):
    """Report whether an attribute of ``node`` matches a compiled pattern.

    :param node: object exposing ``get(attr)``, e.g. an lxml element
    :param attr: name of the attribute to inspect, such as 'class' or 'id'
    :param checkset: compiled regex; ``search`` is used, so a hit anywhere
    in the attribute value counts
    :returns bool: True when the pattern is found. A missing or empty
    attribute is tested as the empty string.

    """
    attr_value = node.get(attr) or ""
    return checkset.search(attr_value) is not None


def generate_hash_id(node):
    """Build a short identifier from the node's serialized markup.

    :param node: lxml etree node
    :returns: the first 8 hex characters of the md5 digest of the node as
    serialized by ``tounicode`` and encoded to utf-8

    """
    markup = tounicode(node)
    digest = md5()
    try:
        digest.update(markup.encode('utf-8', "replace"))
    except Exception as e:
        # Best effort: record the failure and still return a digest of
        # whatever has been fed to the hash so far.
        logger.exception("BOOM! %r", e)

    return digest.hexdigest()[:8]


def get_link_density(node, node_text=None):
    """Return the share of the node's text that sits inside ``<a>`` tags.

    :param node: parsed element tree node
    :param node_text: the node's text_content() if the caller already has
    it; when empty or omitted it is fetched from the node
    :returns float: length of all descendant link text divided by the
    length of the node's text (the divisor is at least 1)

    """
    anchor_chars = 0
    for anchor in node.findall(".//a"):
        anchor_chars += len(anchor.text_content())

    total_chars = len(node_text) if node_text else len(node.text_content())
    return float(anchor_chars) / max(total_chars, 1)


def get_class_weight(node):
    """Compute a node's weight from its ``class`` and ``id`` attributes.

    Each attribute is tested against the negative and positive weight
    patterns independently: a negative match subtracts 25 and a positive
    match adds 25, so the result lies between -50 and 50.

    :param node: node whose 'class' and 'id' attributes are inspected
    :returns int: the combined weight

    """
    weight = 0
    for attr in ('class', 'id'):
        if check_node_attr(node, attr, CLS_WEIGHT_NEGATIVE):
            weight -= 25
        if check_node_attr(node, attr, CLS_WEIGHT_POSITIVE):
            weight += 25

    return weight


def is_unlikely_node(node):
    """Tell whether a node looks like something other than content.

    A node is unlikely when its class or id matches the unlikely pattern,
    neither of them matches the maybe pattern, and the node is not the
    ``body`` tag. Such a node might need to be removed.

    :param node: node whose 'class', 'id' and tag are inspected
    :returns bool:

    """
    attrs = ('class', 'id')
    flagged = any(check_node_attr(node, attr, CLS_UNLIKELY) for attr in attrs)
    redeemed = any(check_node_attr(node, attr, CLS_MAYBE) for attr in attrs)

    return flagged and not redeemed and node.tag != 'body'


def score_candidates(nodes):
    """Given a list of potential nodes, find some initial scores to start.

    Every node with at least 25 characters of text earns points that are
    credited to its parent (in full) and to its grandparent (half). Once all
    nodes are processed, each candidate's score is scaled by
    ``1 - link density`` of the candidate node.

    :param nodes: iterable of lxml nodes to score
    :returns dict: maps each parent/grandparent node that received points to
    its ``ScoredNode``

    """
    MIN_HIT_LENGTH = 25
    MAX_LENGTH_POINTS = 3
    candidates = {}

    for node in nodes:
        logger.debug("Scoring Node")

        # if the node has no parent it knows of, then it ends up creating a
        # body and html tag to parent the html fragment.
        parent = node.getparent()
        grand = parent.getparent() if parent is not None else None

        if parent is None or grand is None:
            logger.debug("Skipping candidate because parent/grand are none")
            continue

        # If this paragraph is less than 25 characters, don't even count it.
        # That includes nodes with no text at all: previously an empty node
        # slipped past this check and still handed a base point to its
        # parent and grandparent.
        innertext = node.text_content() or ""
        if len(innertext) < MIN_HIT_LENGTH:
            logger.debug("Skipping candidate because not enough content.")
            continue

        # Initialize readability data for the parent and grandparent if they
        # are not in the candidate list yet.
        for ancestor in (parent, grand):
            if ancestor not in candidates:
                candidates[ancestor] = ScoredNode(ancestor)

        # Add a point for the paragraph itself as a base.
        content_score = 1

        # Add 0.25 points for any commas within this paragraph
        comma_count = innertext.count(',')
        content_score += comma_count * 0.25
        logger.debug("Bonus points for ,: %s", comma_count)

        # Subtract 0.5 points for each double quote within this paragraph
        quote_count = innertext.count('"')
        content_score -= quote_count * 0.5
        logger.debug('Penalty points for ": %s', quote_count)

        # For every 100 characters in this paragraph, add another point.
        # Up to 3 points.
        length_points = len(innertext) // 100
        content_score += min(length_points, MAX_LENGTH_POINTS)
        logger.debug("Length/content points: %r : %r", length_points,
            content_score)

        # Add the score to the parent.
        logger.debug("From this current node.")
        candidates[parent].content_score += content_score
        logger.debug("Giving parent bonus points: %r",
            candidates[parent].content_score)
        # The grandparent gets half.
        logger.debug("Giving grand bonus points")
        candidates[grand].content_score += (content_score / 2.0)
        logger.debug("Giving grand bonus points: %r",
            candidates[grand].content_score)

    # Scale every candidate by how much of its text is not link text.
    for candidate in candidates.values():
        adjustment = 1 - get_link_density(candidate.node)
        logger.debug("Getting link density adjustment: %r * %r",
            candidate.content_score, adjustment)
        candidate.content_score = candidate.content_score * adjustment

    return candidates


class ScoredNode(object):
    """Pairs a node with the content score tracked for it.

    Many of these may exist while looking for the article, so ``__slots__``
    is declared to keep per-instance memory usage down.

    """
    __slots__ = ['node', 'content_score']

    # Starting score by tag name; any tag not listed here starts at 0.
    _TAG_SCORES = {
        'div': 5, 'article': 5,
        'pre': 3, 'td': 3, 'blockquote': 3,
        'address': -3, 'ol': -3, 'ul': -3, 'dl': -3, 'dd': -3, 'dt': -3,
        'li': -3, 'form': -3,
        'h1': -5, 'h2': -5, 'h3': -5, 'h4': -5, 'h5': -5, 'h6': -5,
        'th': -5,
    }

    def __init__(self, node):
        """Store the node and seed its score from its tag, class and id."""
        self.node = node
        tag_score = self._TAG_SCORES.get(node.tag, 0)
        self.content_score = tag_score + get_class_weight(node)

    @property
    def hash_id(self):
        """Short hash of the node's content, recomputed on every access."""
        return generate_hash_id(self.node)

    def __repr__(self):
        """Show the hash id, the score to one decimal place and the node."""
        template = "{0}: {1:0.1F}\t{2}"
        return template.format(self.hash_id, self.content_score, self.node)
Tests passes for both Python v2.7, v3.3 11 years ago			`# -- coding: utf8 --`

Profile and adjust for performance, add bugfix to parse out mitechie blog post 12 years ago			`"""Handle dealing with scoring nodes and content for our parsing."""`
Tests passes for both Python v2.7, v3.3 11 years ago
			`from __future__ import absolute_import`

Profile and adjust for performance, add bugfix to parse out mitechie blog post 12 years ago			`import re`
Simplify logging 11 years ago			`import logging`
Tests passes for both Python v2.7, v3.3 11 years ago
Create LNODE and update bugs in parsing - Add concept of a LNODE logger that outputs information about scoring, node, and generates a hash_id for the node content so we can track it. - Add `-d` flag to the cmd line client to output the LNODE logging - Update reading in of http content in the client to be unicode - Wrap stdout with a unicode happy stream so we can pipe unicode to less/grep, etc - Add html article to the scorable tags we work with - Make sure we drop iframe along with noscript - Fix scoring bugs around length points - Add the hash_id as a scored node @property 12 years ago			`from hashlib import md5`
			`from lxml.etree import tounicode`
Work on tweaking out parser algorithm to help find the right candidate: fixes #2 12 years ago
Some refactoring starts to help us org tests/code 12 years ago			`# A series of sets of attributes we check to help in determining if a node is`
			`# a potential candidate or not.`
Work on tweaking out parser algorithm to help find the right candidate: fixes #2 12 years ago			`CLS_UNLIKELY = re.compile(('combx\|comment\|community\|disqus\|extra\|foot\|header\|'`
			`'menu\|remark\|rss\|shoutbox\|sidebar\|sponsor\|ad-break\|agegate\|pagination\|'`
Do some link filtring to drop stupid permalinks from the content. 12 years ago			`'pager\|perma\|popup\|tweet\|twitter'), re.I)`
Work on tweaking out parser algorithm to help find the right candidate: fixes #2 12 years ago			`CLS_MAYBE = re.compile('and\|article\|body\|column\|main\|shadow', re.I)`
			`CLS_WEIGHT_POSITIVE = re.compile(('article\|body\|content\|entry\|hentry\|main\|'`
			`'page\|pagination\|post\|text\|blog\|story'), re.I)`
			`CLS_WEIGHT_NEGATIVE = re.compile(('combx\|comment\|com-\|contact\|foot\|footer\|'`
Update to fix client, add head to the css downgrade weights 12 years ago			`'footnote\|head\|masthead\|media\|meta\|outbrain\|promo\|related\|scroll\|shoutbox\|'`
Work on tweaking out parser algorithm to help find the right candidate: fixes #2 12 years ago			`'sidebar\|sponsor\|shopping\|tags\|tool\|widget'), re.I)`
Some refactoring starts to help us org tests/code 12 years ago
Simplify logging 11 years ago			`logger = logging.getLogger("breadability")`

Some refactoring starts to help us org tests/code 12 years ago
			`def check_node_attr(node, attr, checkset):`
Work on tweaking out parser algorithm to help find the right candidate: fixes #2 12 years ago			`value = node.get(attr) or ""`
			`check = checkset.search(value)`
			`if check:`
Some refactoring starts to help us org tests/code 12 years ago			`return True`
			`else:`
			`return False`


Update scoring and tests for the antipope article 12 years ago			`def generate_hash_id(node):`
			`"""Generate a hash_id for the node in question.`

			`:param node: lxml etree node`

			`"""`
			`content = tounicode(node)`
			`hashed = md5()`
			`try:`
Update kwarg for older python 12 years ago			`hashed.update(content.encode('utf-8', "replace"))`
Tests passes for both Python v2.7, v3.3 11 years ago			`except Exception as e:`
Simplify logging 11 years ago			`logger.exception("BOOM! %r", e)`
Update scoring and tests for the antipope article 12 years ago
			`return hashed.hexdigest()[0:8]`


Create LNODE and update bugs in parsing - Add concept of a LNODE logger that outputs information about scoring, node, and generates a hash_id for the node content so we can track it. - Add `-d` flag to the cmd line client to output the LNODE logging - Update reading in of http content in the client to be unicode - Wrap stdout with a unicode happy stream so we can pipe unicode to less/grep, etc - Add html article to the scorable tags we work with - Make sure we drop iframe along with noscript - Fix scoring bugs around length points - Add the hash_id as a scored node @property 12 years ago			`def get_link_density(node, node_text=None):`
Some refactoring starts to help us org tests/code 12 years ago			`"""Generate a value for the number of links in the node.`

			`:param node: pared elementree node`
Create LNODE and update bugs in parsing - Add concept of a LNODE logger that outputs information about scoring, node, and generates a hash_id for the node content so we can track it. - Add `-d` flag to the cmd line client to output the LNODE logging - Update reading in of http content in the client to be unicode - Wrap stdout with a unicode happy stream so we can pipe unicode to less/grep, etc - Add html article to the scorable tags we work with - Make sure we drop iframe along with noscript - Fix scoring bugs around length points - Add the hash_id as a scored node @property 12 years ago			`:param node_text: if we already have the text_content() make this easier`
			`on us.`
Some refactoring starts to help us org tests/code 12 years ago			`:returns float:`

			`"""`
Garden and lint 12 years ago			`link_length = sum([len(a.text_content()) or 0`
			`for a in node.findall(".//a")])`
Create LNODE and update bugs in parsing - Add concept of a LNODE logger that outputs information about scoring, node, and generates a hash_id for the node content so we can track it. - Add `-d` flag to the cmd line client to output the LNODE logging - Update reading in of http content in the client to be unicode - Wrap stdout with a unicode happy stream so we can pipe unicode to less/grep, etc - Add html article to the scorable tags we work with - Make sure we drop iframe along with noscript - Fix scoring bugs around length points - Add the hash_id as a scored node @property 12 years ago			`if node_text:`
			`text_length = len(node_text)`
			`else:`
			`text_length = len(node.text_content())`
Some refactoring starts to help us org tests/code 12 years ago			`return float(link_length) / max(text_length, 1)`


			`def get_class_weight(node):`
			`"""Get an elements class/id weight.`

			`We're using sets to help efficiently check for existence of matches.`

			`"""`
			`weight = 0`
			`if check_node_attr(node, 'class', CLS_WEIGHT_NEGATIVE):`
			`weight = weight - 25`
			`if check_node_attr(node, 'class', CLS_WEIGHT_POSITIVE):`
			`weight = weight + 25`

			`if check_node_attr(node, 'id', CLS_WEIGHT_NEGATIVE):`
			`weight = weight - 25`
			`if check_node_attr(node, 'id', CLS_WEIGHT_POSITIVE):`
			`weight = weight + 25`

			`return weight`


			`def is_unlikely_node(node):`
			`"""Short helper for checking unlikely status.`

			`If the class or id are in the unlikely list, and there's not also a`
			`class/id in the likely list then it might need to be removed.`

			`"""`
			`unlikely = check_node_attr(node, 'class', CLS_UNLIKELY) or \`
			`check_node_attr(node, 'id', CLS_UNLIKELY)`

			`maybe = check_node_attr(node, 'class', CLS_MAYBE) or \`
			`check_node_attr(node, 'id', CLS_MAYBE)`

			`if unlikely and not maybe and node.tag != 'body':`
			`return True`
			`else:`
			`return False`


			`def score_candidates(nodes):`
			`"""Given a list of potential nodes, find some initial scores to start"""`
			`MIN_HIT_LENTH = 25`
			`candidates = {}`

			`for node in nodes:`
Simplify logging 11 years ago			`logger.debug("Scoring Node")`
Create LNODE and update bugs in parsing - Add concept of a LNODE logger that outputs information about scoring, node, and generates a hash_id for the node content so we can track it. - Add `-d` flag to the cmd line client to output the LNODE logging - Update reading in of http content in the client to be unicode - Wrap stdout with a unicode happy stream so we can pipe unicode to less/grep, etc - Add html article to the scorable tags we work with - Make sure we drop iframe along with noscript - Fix scoring bugs around length points - Add the hash_id as a scored node @property 12 years ago
Some refactoring starts to help us org tests/code 12 years ago			`content_score = 0`
Started to do some testing, but really not happy with it 12 years ago			`# if the node has no parent it knows of, then it ends up creating a`
			`# body and html tag to parent the html fragment.`
Some refactoring starts to help us org tests/code 12 years ago			`parent = node.getparent()`
			`grand = parent.getparent() if parent is not None else None`
Work on tweaking out parser algorithm to help find the right candidate: fixes #2 12 years ago			`innertext = node.text_content()`
Some refactoring starts to help us org tests/code 12 years ago
			`if parent is None or grand is None:`
Simplify logging 11 years ago			`logger.debug("Skipping candidate because parent/grand are none")`
Some refactoring starts to help us org tests/code 12 years ago			`continue`

			`# If this paragraph is less than 25 characters, don't even count it.`
			`if innertext and len(innertext) < MIN_HIT_LENTH:`
Simplify logging 11 years ago			`logger.debug("Skipping candidate because not enough content.")`
Some refactoring starts to help us org tests/code 12 years ago			`continue`

			`# Initialize readability data for the parent.`
			`# if the parent node isn't in the candidate list, add it`
			`if parent not in candidates:`
			`candidates[parent] = ScoredNode(parent)`

			`if grand not in candidates:`
			`candidates[grand] = ScoredNode(grand)`

			`# Add a point for the paragraph itself as a base.`
			`content_score += 1`

Add a penalty for double quote chars in paragraphs. - They are far more common in random commented code and proprietary metadata that keeps slipping by the filter as actual content. - Downgraded the score value of commas for the same reason. - Prep for 0.1.10 release with these changes. Add credits and tweak the " and , scoring Update version and update the scoring code 12 years ago			`if innertext:`
			`# Add 0.25 points for any commas within this paragraph`
			`content_score += innertext.count(',') * 0.25`
Simplify logging 11 years ago			`logger.debug("Bonus points for ,: " + str(innertext.count(',')))`
Add a penalty for double quote chars in paragraphs. - They are far more common in random commented code and proprietary metadata that keeps slipping by the filter as actual content. - Downgraded the score value of commas for the same reason. - Prep for 0.1.10 release with these changes. Add credits and tweak the " and , scoring Update version and update the scoring code 12 years ago
			`# Subtract 0.5 points for each double quote within this paragraph`
			`content_score += innertext.count('"') * (-0.5)`
Simplify logging 11 years ago			`logger.debug('Penalty points for ": ' + str(innertext.count('"')))`
Add a penalty for double quote chars in paragraphs. - They are far more common in random commented code and proprietary metadata that keeps slipping by the filter as actual content. - Downgraded the score value of commas for the same reason. - Prep for 0.1.10 release with these changes. Add credits and tweak the " and , scoring Update version and update the scoring code 12 years ago
			`# For every 100 characters in this paragraph, add another point.`
			`# Up to 3 points.`
Tests passes for both Python v2.7, v3.3 11 years ago			`length_points = len(innertext) // 100`
Add a penalty for double quote chars in paragraphs. - They are far more common in random commented code and proprietary metadata that keeps slipping by the filter as actual content. - Downgraded the score value of commas for the same reason. - Prep for 0.1.10 release with these changes. Add credits and tweak the " and , scoring Update version and update the scoring code 12 years ago
			`if length_points > 3:`
			`content_score += 3`
			`else:`
			`content_score += length_points`
Simplify logging 11 years ago			`logger.debug("Length/content points: %r : %r", length_points,`
			`content_score)`
Some refactoring starts to help us org tests/code 12 years ago
Work on tweaking out parser algorithm to help find the right candidate: fixes #2 12 years ago			`# Add the score to the parent.`
Simplify logging 11 years ago			`logger.debug("From this current node.")`
Work on tweaking out parser algorithm to help find the right candidate: fixes #2 12 years ago			`candidates[parent].content_score += content_score`
Simplify logging 11 years ago			`logger.debug("Giving parent bonus points: %r", candidates[parent].content_score)`
Work on tweaking out parser algorithm to help find the right candidate: fixes #2 12 years ago			`# The grandparent gets half.`
Simplify logging 11 years ago			`logger.debug("Giving grand bonus points")`
Create LNODE and update bugs in parsing - Add concept of a LNODE logger that outputs information about scoring, node, and generates a hash_id for the node content so we can track it. - Add `-d` flag to the cmd line client to output the LNODE logging - Update reading in of http content in the client to be unicode - Wrap stdout with a unicode happy stream so we can pipe unicode to less/grep, etc - Add html article to the scorable tags we work with - Make sure we drop iframe along with noscript - Fix scoring bugs around length points - Add the hash_id as a scored node @property 12 years ago			`candidates[grand].content_score += (content_score / 2.0)`
Simplify logging 11 years ago			`logger.debug("Giving grand bonus points: %r", candidates[grand].content_score)`
Some refactoring starts to help us org tests/code 12 years ago
Create LNODE and update bugs in parsing - Add concept of a LNODE logger that outputs information about scoring, node, and generates a hash_id for the node content so we can track it. - Add `-d` flag to the cmd line client to output the LNODE logging - Update reading in of http content in the client to be unicode - Wrap stdout with a unicode happy stream so we can pipe unicode to less/grep, etc - Add html article to the scorable tags we work with - Make sure we drop iframe along with noscript - Fix scoring bugs around length points - Add the hash_id as a scored node @property 12 years ago			`for candidate in candidates.values():`
Garden and lint 12 years ago			`adjustment = 1 - get_link_density(candidate.node)`
Simplify logging 11 years ago			`logger.debug("Getting link density adjustment: %r * %r",`
			`candidate.content_score, adjustment)`
Garden and lint 12 years ago			`candidate.content_score = candidate.content_score * (adjustment)`
Some refactoring starts to help us org tests/code 12 years ago
			`return candidates`


			`class ScoredNode(object):`
			`"""We need Scored nodes we use to track possible article matches`

			`We might have a bunch of these so we use __slots__ to keep memory usage`
			`down.`

			`"""`
			`__slots__ = ['node', 'content_score']`

Add some more debugging to support tracing wtf we did and why 12 years ago			`def __repr__(self):`
			`"""Helpful representation of our Scored Node"""`
Create LNODE and update bugs in parsing - Add concept of a LNODE logger that outputs information about scoring, node, and generates a hash_id for the node content so we can track it. - Add `-d` flag to the cmd line client to output the LNODE logging - Update reading in of http content in the client to be unicode - Wrap stdout with a unicode happy stream so we can pipe unicode to less/grep, etc - Add html article to the scorable tags we work with - Make sure we drop iframe along with noscript - Fix scoring bugs around length points - Add the hash_id as a scored node @property 12 years ago			`return "{0}: {1:0.1F}\t{2}".format(`
			`self.hash_id,`
			`self.content_score,`
			`self.node)`
Add some more debugging to support tracing wtf we did and why 12 years ago
Some refactoring starts to help us org tests/code 12 years ago			`def __init__(self, node):`
			`"""Given node, set an initial score and weigh based on css and id"""`
			`self.node = node`
			`content_score = 0`
Add some ScoredNode tests as well 12 years ago			`if node.tag in ['div', 'article']:`
Some refactoring starts to help us org tests/code 12 years ago			`content_score = 5`

			`if node.tag in ['pre', 'td', 'blockquote']:`
			`content_score = 3`

			`if node.tag in ['address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li',`
			`'form']:`
			`content_score = -3`
			`if node.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th']:`
			`content_score = -5`
Add some ScoredNode tests as well 12 years ago
Some refactoring starts to help us org tests/code 12 years ago			`content_score += get_class_weight(node)`
			`self.content_score = content_score`
Create LNODE and update bugs in parsing - Add concept of a LNODE logger that outputs information about scoring, node, and generates a hash_id for the node content so we can track it. - Add `-d` flag to the cmd line client to output the LNODE logging - Update reading in of http content in the client to be unicode - Wrap stdout with a unicode happy stream so we can pipe unicode to less/grep, etc - Add html article to the scorable tags we work with - Make sure we drop iframe along with noscript - Fix scoring bugs around length points - Add the hash_id as a scored node @property 12 years ago
			`@property`
			`def hash_id(self):`
Update scoring and tests for the antipope article 12 years ago			`return generate_hash_id(self.node)`