From be77f99be150330ae70751acb914a1d3f57f01ca Mon Sep 17 00:00:00 2001 From: Richard Harding Date: Wed, 16 May 2012 21:34:01 -0400 Subject: [PATCH] Add doc and candidates properties to the article --- src/breadability/readable.py | 33 ++++++++++++++++--------- src/breadability/tests/test_readable.py | 6 +++++ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/src/breadability/readable.py b/src/breadability/readable.py index 7e41f5e..e461ddf 100644 --- a/src/breadability/readable.py +++ b/src/breadability/readable.py @@ -383,25 +383,36 @@ class Article(object): return tounicode(self._readable) @cached_property(ttl=600) - def readable(self): - return tounicode(self._readable) - - @cached_property(ttl=600) - def _readable(self): - """The readable parsed article""" + def doc(self): + """The doc is the parsed xml tree of the given html.""" doc = self.orig.html # cleaning doesn't return, just wipes in place html_cleaner(doc) doc = drop_tag(doc, 'noscript', 'iframe') doc = transform_misused_divs_into_paragraphs(doc) + return doc + + @cached_property(ttl=600) + def candidates(self): + """Generate the list of candidates from the doc.""" + doc = self.doc candidates, should_drop = find_candidates(doc) + self._should_drop = should_drop + return candidates + + @cached_property(ttl=600) + def readable(self): + return tounicode(self._readable) - if candidates: + @cached_property(ttl=600) + def _readable(self): + """The readable parsed article""" + if self.candidates: LOG.debug('Candidates found:') pp = PrettyPrinter(indent=2) # right now we return the highest scoring candidate content - by_score = sorted([c for c in candidates.values()], + by_score = sorted([c for c in self.candidates.values()], key=attrgetter('content_score'), reverse=True) LOG.debug(pp.pformat(by_score)) @@ -409,7 +420,7 @@ class Article(object): # for extra content winner = by_score[0] LOG.debug('Selected winning node: ' + str(winner)) - updated_winner = check_siblings(winner, candidates) + updated_winner = check_siblings(winner, self.candidates) LOG.debug('Begin final prep of article') updated_winner.node = prep_article(updated_winner.node) doc = build_base_document(updated_winner.node, self.fragment) @@ -418,8 +429,8 @@ class Article(object): LOG.debug('Begin final prep of article') # since we've not found a good candidate we're should help this # cleanup by removing the should_drop we spotted. - [n.drop_tree() for n in should_drop] - doc = prep_article(doc) + [n.drop_tree() for n in self._should_drop] + doc = prep_article(self.doc) doc = build_base_document(doc, self.fragment) return doc diff --git a/src/breadability/tests/test_readable.py b/src/breadability/tests/test_readable.py index f109112..9508d5a 100644 --- a/src/breadability/tests/test_readable.py +++ b/src/breadability/tests/test_readable.py @@ -147,6 +147,12 @@ class TestCandidateNodes(TestCase): self.assertEqual(ScoredNode(doc).content_score, -5) + def test_article_enables_candidate_access(self): + """Candidates are accessible after document processing.""" + doc = Article(load_article('ars/ars.001.html')) + self.assertTrue(hasattr(doc, 'candidates')) + + class TestClassWeights(TestCase): """Certain ids and classes get us bonus points."""