Link density is computed with normalized whitespace

HTML code contains a lot of whitespace, and if there is
a large amount of indentation characters, the link density
comes out small even if the node contains only links with
useful text.
This commit is contained in:
Mišo Belica 2013-03-26 19:55:18 +01:00
parent 671580ac2c
commit e6191fe0d1
2 changed files with 20 additions and 14 deletions

View File

@ -3,6 +3,7 @@
"""Handle dealing with scoring nodes and content for our parsing.""" """Handle dealing with scoring nodes and content for our parsing."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division, print_function
import re import re
import logging import logging
@ -10,6 +11,7 @@ import logging
from hashlib import md5 from hashlib import md5
from lxml.etree import tostring from lxml.etree import tostring
from ._py3k import to_bytes from ._py3k import to_bytes
from .utils import normalize_whitespace
# A series of sets of attributes we check to help in determining if a node is # A series of sets of attributes we check to help in determining if a node is
@ -76,10 +78,20 @@ def get_link_density(node, node_text=None):
this easier on us. this easier on us.
:returns float: :returns float:
""" """
link_length = sum(len(a.text_content()) or 0 for a in node.findall(".//a")) if node_text is None:
text_length = len(node_text if node_text else node.text_content()) node_text = node.text_content()
node_text = normalize_whitespace(node_text.strip())
return float(link_length) / max(text_length, 1) text_length = len(node_text)
if text_length == 0:
return 0.0
links_length = sum(map(_get_normalized_text_length, node.findall(".//a")))
return links_length / text_length
def _get_normalized_text_length(node):
return len(normalize_whitespace(node.text_content().strip()))
def get_class_weight(node): def get_class_weight(node):

View File

@ -8,7 +8,6 @@ import re
from operator import attrgetter from operator import attrgetter
from lxml.html import document_fromstring from lxml.html import document_fromstring
from lxml.html import fragment_fromstring from lxml.html import fragment_fromstring
from readability._py3k import to_unicode
from readability.readable import Article from readability.readable import Article
from readability.scoring import check_node_attributes from readability.scoring import check_node_attributes
from readability.scoring import get_class_weight from readability.scoring import get_class_weight
@ -91,20 +90,17 @@ class TestLinkDensity(unittest.TestCase):
def test_empty_node(self): def test_empty_node(self):
"""An empty node doesn't have much of a link density""" """An empty node doesn't have much of a link density"""
empty_div = to_unicode("<div></div>") doc = Article("<div></div>")
doc = Article(empty_div) self.assertEqual(get_link_density(doc.readable_dom), 0.0)
assert 0 == get_link_density(doc.readable_dom), "Link density is nadda"
def test_small_doc_no_links(self): def test_small_doc_no_links(self):
doc = Article(load_snippet('document_min.html')) doc = Article(load_snippet('document_min.html'))
assert 0 == get_link_density(doc.readable_dom), "Still no link density" self.assertEqual(get_link_density(doc.readable_dom), 0.0)
def test_several_links(self): def test_several_links(self):
"""This doc has a 3 links with the majority of content.""" """This doc has a 3 links with the majority of content."""
doc = Article(load_snippet('document_absolute_url.html')) doc = Article(load_snippet('document_absolute_url.html'))
self.assertAlmostEqual( self.assertAlmostEqual(get_link_density(doc.readable_dom), 22/24)
get_link_density(doc.readable_dom), 0.349,
places=3)
class TestClassWeight(unittest.TestCase): class TestClassWeight(unittest.TestCase):
@ -112,9 +108,7 @@ class TestClassWeight(unittest.TestCase):
def test_no_matches_zero(self): def test_no_matches_zero(self):
"""If you don't have the attribute then you get a weight of 0""" """If you don't have the attribute then you get a weight of 0"""
empty_div = to_unicode("<div></div>") node = fragment_fromstring("<div></div>")
node = fragment_fromstring(empty_div)
self.assertEqual(get_class_weight(node), 0) self.assertEqual(get_class_weight(node), 0)
def test_id_hits(self): def test_id_hits(self):