mirror of
https://github.com/bookieio/breadability
synced 2024-11-18 09:25:29 +00:00
Link density is computed with normalized whitespace
HTML code contains a lot of whitespace, and if there is a large amount of indentation characters, the link density is small even if there are only links with useful text.
This commit is contained in:
parent
671580ac2c
commit
e6191fe0d1
@ -3,6 +3,7 @@
|
||||
"""Handle dealing with scoring nodes and content for our parsing."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division, print_function
|
||||
|
||||
import re
|
||||
import logging
|
||||
@ -10,6 +11,7 @@ import logging
|
||||
from hashlib import md5
|
||||
from lxml.etree import tostring
|
||||
from ._py3k import to_bytes
|
||||
from .utils import normalize_whitespace
|
||||
|
||||
|
||||
# A series of sets of attributes we check to help in determining if a node is
|
||||
@ -76,10 +78,20 @@ def get_link_density(node, node_text=None):
|
||||
this easier on us.
|
||||
:returns float:
|
||||
"""
|
||||
link_length = sum(len(a.text_content()) or 0 for a in node.findall(".//a"))
|
||||
text_length = len(node_text if node_text else node.text_content())
|
||||
if node_text is None:
|
||||
node_text = node.text_content()
|
||||
node_text = normalize_whitespace(node_text.strip())
|
||||
|
||||
return float(link_length) / max(text_length, 1)
|
||||
text_length = len(node_text)
|
||||
if text_length == 0:
|
||||
return 0.0
|
||||
|
||||
links_length = sum(map(_get_normalized_text_length, node.findall(".//a")))
|
||||
return links_length / text_length
|
||||
|
||||
|
||||
def _get_normalized_text_length(node):
    """Return the length of *node*'s text content after whitespace normalization.

    Stripping and collapsing whitespace first keeps indentation characters
    from inflating the measured length.
    """
    stripped = node.text_content().strip()
    return len(normalize_whitespace(stripped))
|
||||
|
||||
|
||||
def get_class_weight(node):
|
||||
|
@ -8,7 +8,6 @@ import re
|
||||
from operator import attrgetter
|
||||
from lxml.html import document_fromstring
|
||||
from lxml.html import fragment_fromstring
|
||||
from readability._py3k import to_unicode
|
||||
from readability.readable import Article
|
||||
from readability.scoring import check_node_attributes
|
||||
from readability.scoring import get_class_weight
|
||||
@ -91,20 +90,17 @@ class TestLinkDensity(unittest.TestCase):
|
||||
|
||||
def test_empty_node(self):
|
||||
"""An empty node doesn't have much of a link density"""
|
||||
empty_div = to_unicode("<div></div>")
|
||||
doc = Article(empty_div)
|
||||
assert 0 == get_link_density(doc.readable_dom), "Link density is nadda"
|
||||
doc = Article("<div></div>")
|
||||
self.assertEqual(get_link_density(doc.readable_dom), 0.0)
|
||||
|
||||
def test_small_doc_no_links(self):
|
||||
doc = Article(load_snippet('document_min.html'))
|
||||
assert 0 == get_link_density(doc.readable_dom), "Still no link density"
|
||||
self.assertEqual(get_link_density(doc.readable_dom), 0.0)
|
||||
|
||||
def test_several_links(self):
|
||||
"""This doc has a 3 links with the majority of content."""
|
||||
doc = Article(load_snippet('document_absolute_url.html'))
|
||||
self.assertAlmostEqual(
|
||||
get_link_density(doc.readable_dom), 0.349,
|
||||
places=3)
|
||||
self.assertAlmostEqual(get_link_density(doc.readable_dom), 22/24)
|
||||
|
||||
|
||||
class TestClassWeight(unittest.TestCase):
|
||||
@ -112,9 +108,7 @@ class TestClassWeight(unittest.TestCase):
|
||||
|
||||
def test_no_matches_zero(self):
|
||||
"""If you don't have the attribute then you get a weight of 0"""
|
||||
empty_div = to_unicode("<div></div>")
|
||||
node = fragment_fromstring(empty_div)
|
||||
|
||||
node = fragment_fromstring("<div></div>")
|
||||
self.assertEqual(get_class_weight(node), 0)
|
||||
|
||||
def test_id_hits(self):
|
||||
|
Loading…
Reference in New Issue
Block a user