Update tests for scoring; return a div or html doc depending on the content found

pull/4/merge
Richard Harding 12 years ago
parent 60ab4a96b0
commit 8e96cb7844

@ -1,4 +1,5 @@
import chardet
import logging
import re
from lxml.etree import tostring
from lxml.etree import tounicode
@ -8,6 +9,7 @@ from lxml.html import HTMLParser
from breadability.utils import cached_property
LOG = logging.getLogger(__name__)
utf8_parser = HTMLParser(encoding='utf-8')

@ -1,6 +1,6 @@
import re
from collections import namedtuple
from operator import attrgetter
from lxml.etree import tounicode
from lxml.html import fragment_fromstring
from breadability.document import OriginalDocument
@ -21,6 +21,13 @@ READABLERE = RegexList(
negative=()
)
# Class/id tokens that suggest a node holds real article content; matching
# one of these earns the node a scoring bonus.
CLS_WEIGHT_POSITIVE = {
    'article', 'blog', 'body', 'content', 'entry', 'hentry', 'main',
    'page', 'pagination', 'post', 'story', 'text',
}

# Class/id tokens that suggest boilerplate/page chrome (comments, footers,
# sidebars, ads); matching one of these earns the node a scoring penalty.
CLS_WEIGHT_NEGATIVE = {
    'com-', 'combx', 'comment', 'contact', 'foot', 'footer', 'footnote',
    'masthead', 'media', 'meta', 'outbrain', 'promo', 'related', 'scroll',
    'shopping', 'shoutbox', 'sidebar', 'sponsor', 'tags', 'tool', 'widget',
}
def drop_tag(doc, *tags):
[[n.drop_tree() for n in doc.iterfind(".//" + tag)]
@ -35,11 +42,15 @@ def build_base_document(html):
"""
found_body = html.find('.//body')
if found_body is not None:
# remove any CSS and set our own
found_body.set('id', 'readabilityBody')
return found_body
if found_body is None:
fragment = fragment_fromstring('<div/>')
fragment.set('id', 'readabilityBody')
fragment.append(html)
return fragment
else:
found_body.set('id', 'readabilityBody')
return html
def transform_misused_divs_into_paragraphs(doc):
"""Turn all divs that don't have children block level elements into p's
@ -68,6 +79,78 @@ def transform_misused_divs_into_paragraphs(doc):
return doc
###### SCORING
def get_class_weight(node):
    """Get an element's class/id weight.

    We're using sets to help efficiently check for existence of matches.

    :param node: an lxml element exposing ``.get(attr, default=...)``
    :return: int adjustment; +/-25 per class-set match and +/-25 per id
        match, so the result lies in -50..50.
    """
    weight = 0

    # A node may carry several whitespace-separated classes; any overlap
    # with the positive/negative token sets adjusts the weight once per set.
    # str.split() (no argument) also handles tabs and repeated spaces.
    cls = set(node.get('class', default="").split())
    if cls.intersection(CLS_WEIGHT_NEGATIVE):
        weight = weight - 25
    if cls.intersection(CLS_WEIGHT_POSITIVE):
        weight = weight + 25

    # The id is a single token. An empty-string default keeps a missing id
    # from matching anything (the previous default was the literal string
    # "None", a misleading sentinel that only worked because "None" happens
    # not to appear in either weight set).
    ids = node.get('id', default="")
    if ids in CLS_WEIGHT_NEGATIVE:
        weight = weight - 25
    if ids in CLS_WEIGHT_POSITIVE:
        weight = weight + 25

    return weight
def score_candidates(nodes):
    """Given a list of potential nodes, find some initial scores to start.

    Each scorable node contributes a content score to its parent and (at
    half value) its grandparent; both ancestors are wrapped in
    CandidateNode containers on first sight.

    :param nodes: iterable of lxml elements (paragraph-like nodes)
    :return: dict mapping ancestor elements to their CandidateNode
    """
    MIN_HIT_LENGTH = 25
    candidates = {}

    for node in nodes:
        parent = node.getparent()
        grand = parent.getparent() if parent is not None else None

        # Without both ancestors there is nowhere to accumulate a score.
        if parent is None or grand is None:
            continue

        innertext = node.text

        # If this paragraph is less than 25 characters, don't even count it.
        if innertext and len(innertext) < MIN_HIT_LENGTH:
            continue

        # If an ancestor isn't in the candidate list yet, add it.
        if parent not in candidates:
            candidates[parent] = CandidateNode(parent)
        if grand not in candidates:
            candidates[grand] = CandidateNode(grand)

        # Add a point for the paragraph itself as a base.
        content_score = 1

        # Add points for any commas within this paragraph.
        content_score += innertext.count(',') if innertext else 0

        # For every 100 characters in this paragraph, add another point,
        # up to 3 points. (Fixes the old code, which used `%` instead of
        # integer division and then *overwrote* the accumulated score
        # instead of adding the capped bonus to it.)
        if innertext:
            content_score += min(len(innertext) // 100, 3)

        # Add the score to the parent; the grandparent gets half, as the
        # original comment intended.
        candidates[parent].content_score += content_score
        candidates[grand].content_score += content_score / 2.0

    return candidates
def process(doc):
"""Process this doc to make it readable.
@ -83,21 +166,44 @@ def process(doc):
# NOTE(review): this span is a unified-diff rendering, not runnable code.
# Both the removed pre-rename lines (using `n`) and the added post-rename
# lines (using `node`) appear below, and the hunk header above marks a gap
# in the function body — confirm against the real file before editing.
"""Short helper for checking unlikely status."""
if READABLERE.unlikely.match(nodeid):
if not READABLERE.maybe.match(nodeid):
if n.tag != "body":
if node.tag != "body":
return True
for n in doc.getiterator():
for node in doc.getiterator():
# if the id or class show up in the unlikely list, mark for removal
nodeid = "%s%s" % (n.get('class', ''), n.get('id', ''))
if is_unlikely_node(n):
unlikely.append(n)
nodeid = "%s%s" % (node.get('class', ''), node.get('id', ''))
if is_unlikely_node(node):
unlikely.append(node)
if n.tag in scorable_node_tags:
nodes_to_score.append(n)
if node.tag in scorable_node_tags:
nodes_to_score.append(node)
# process our clean up instructions
[n.drop_tree() for n in unlikely]
return doc
# post-change code path: score the collected nodes and return candidates
candidates = score_candidates(nodes_to_score)
return candidates
class CandidateNode(object):
    """A dom node paired with the running content score used in scoring.

    The score is seeded from the node's tag (container tags such as ``div``
    score up; list, form, and heading tags score down) plus the class/id
    weight from ``get_class_weight``.
    """
    __slots__ = ['node', 'content_score']

    def __init__(self, node):
        self.node = node
        # The tag sets are mutually exclusive, so use an elif chain rather
        # than re-testing every set after a match has already been found.
        if node.tag == 'div':
            content_score = 5
        elif node.tag in ('pre', 'td', 'blockquote'):
            content_score = 3
        elif node.tag in ('address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li',
                          'form'):
            content_score = -3
        elif node.tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
            content_score = -5
        else:
            content_score = 0
        # Class/id hints adjust the seed by +/-25 points per match.
        content_score += get_class_weight(node)
        self.content_score = content_score
class Article(object):
@ -110,10 +216,19 @@ class Article(object):
def readable(self):
"""The readable parsed article"""
doc = self.orig.html
doc = build_base_document(doc)
doc = drop_tag(doc, 'script', 'link', 'style', 'noscript')
doc = transform_misused_divs_into_paragraphs(doc)
doc = process(doc)
candidates = process(doc)
if candidates:
# right now we return the highest scoring candidate content
by_score = sorted([c for c in candidates.values()],
key=attrgetter('content_score'), reverse=True)
doc = build_base_document(by_score[0].node)
else:
doc = build_base_document(doc)
return doc

@ -7,3 +7,8 @@ TEST_DIR = path.dirname(__file__)
def load_snippet(filename):
    """Helper to fetch in the content of a test snippet.

    Uses a context manager so the file handle is closed deterministically
    instead of leaking until garbage collection.
    """
    with open(path.join(TEST_DIR, 'test_snippets', filename)) as f:
        return f.read()
def load_article(filename):
    """Helper to fetch in the content of a test article.

    Uses a context manager so the file handle is closed deterministically
    instead of leaking until garbage collection.
    """
    with open(path.join(TEST_DIR, 'test_articles', filename)) as f:
        return f.read()

@ -1,10 +1,15 @@
from lxml.etree import tounicode
from lxml.html import document_fromstring
from lxml.html import fragment_fromstring
from unittest import TestCase
from breadability.readable import Article
from breadability.readable import CandidateNode
from breadability.readable import get_class_weight
from breadability.readable import score_candidates
from breadability.readable import transform_misused_divs_into_paragraphs
from breadability.tests import load_snippet
from breadability.tests import load_article
class TestReadableDocument(TestCase):
@ -13,8 +18,8 @@ class TestReadableDocument(TestCase):
def test_load_doc(self):
"""We get back an element tree from our original doc"""
doc = Article(load_snippet('document_min.html'))
# We get back the document as a body tag currently by default.
self.assertEqual(doc.readable.tag, 'body')
# We get back the document as a div tag currently by default.
self.assertEqual(doc.readable.tag, 'html')
def test_doc_no_scripts_styles(self):
"""Step #1 remove all scripts from the document"""
@ -31,8 +36,9 @@ class TestReadableDocument(TestCase):
"""
doc = Article(load_snippet('document_min.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
self.assertEqual(doc.readable.tag, 'html')
found_body = doc.readable.find('.//body')
self.assertEqual(found_body.get('id'), 'readabilityBody')
def test_body_doesnt_exist(self):
"""If we can't find a body, then we create one.
@ -41,8 +47,9 @@ class TestReadableDocument(TestCase):
"""
doc = Article(load_snippet('document_no_body.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
self.assertEqual(doc.readable.tag, 'html')
found_body = doc.readable.find('.//body')
self.assertEqual(found_body.get('id'), 'readabilityBody')
def test_bare_content(self):
"""If the document is just pure content, no html tags we should be ok
@ -51,7 +58,7 @@ class TestReadableDocument(TestCase):
"""
doc = Article(load_snippet('document_only_content.html'))
self.assertEqual(doc.readable.tag, 'body')
self.assertEqual(doc.readable.tag, 'div')
self.assertEqual(doc.readable.get('id'), 'readabilityBody')
@ -111,3 +118,76 @@ class TestCleaning(TestCase):
transform_misused_divs_into_paragraphs(test_doc2)),
u'<html><body><p>simple<a href="">link</a></p></body></html>'
)
class TestCandidateNodes(TestCase):
    """Candidate nodes are scoring containers we use."""

    def test_candidate_scores(self):
        """We should be getting back objects with some scores.

        Replaces four copy-pasted loops with one data-driven table mapping
        sample markup to the seed score its tag should earn.
        """
        expected_scores = {
            '<div/>': 5,
            '<pre/>': 3,
            '<td/>': 3,
            '<blockquote/>': 3,
            '<address/>': -3,
            '<ol/>': -3,
            '<h1/>': -5,
            '<h2/>': -5,
            '<h3/>': -5,
            '<h4/>': -5,
        }
        for markup, score in expected_scores.items():
            node = fragment_fromstring(markup)
            self.assertEqual(CandidateNode(node).content_score, score)
class TestClassWeights(TestCase):
    """Certain ids and classes get us bonus points."""

    def _weight_for(self, markup):
        """Parse *markup* into a node and return its class/id weight."""
        return get_class_weight(fragment_fromstring(markup))

    def test_positive_class(self):
        """Some classes get us bonus points."""
        self.assertEqual(self._weight_for('<p class="article">'), 25)

    def test_positive_ids(self):
        """Some ids get us bonus points."""
        self.assertEqual(self._weight_for('<p id="content">'), 25)

    def test_negative_class(self):
        """Some classes get us negative points."""
        self.assertEqual(self._weight_for('<p class="comment">'), -25)

    def test_negative_ids(self):
        """Some ids get us negative points."""
        self.assertEqual(self._weight_for('<p id="media">'), -25)
class TestScoringNodes(TestCase):
    """We take our list of potential nodes and score them up."""

    def test_we_get_candidates(self):
        """Processing candidates should get us a list of nodes to try out."""
        # We'll start out using our first real test document.
        doc = document_fromstring(load_article('ars/ars.001.html'))
        # Collect the scorable paragraph-like nodes with a comprehension
        # instead of a manual append loop.
        test_nodes = [node for node in doc.getiterator()
                      if node.tag in ('p', 'td', 'pre')]
        candidates = score_candidates(test_nodes)
        # This might change as we tweak our algorithm, but if it does
        # change, it signifies we need to look at what we changed.
        self.assertEqual(len(candidates), 8)
        # One of these should have a decent score.
        max_score = max(c.content_score for c in candidates.values())
        self.assertTrue(max_score > 100)

Loading…
Cancel
Save