From 433195e122fe51665fa09ff785e1f8df4c0233d0 Mon Sep 17 00:00:00 2001
From: Richard Harding
Date: Fri, 29 Nov 2013 11:58:34 -0500
Subject: [PATCH] Update syncing with the other branch

---
 src/breadability/__init__.py | 3 -
 src/breadability/client.py | 118 ----
 src/breadability/readable.py | 508 ------
 src/breadability/scoring.py | 237 --------
 .../test_businessinsider-com/__init__.py | 0
 .../test_businessinsider-com/article.html | 0
 .../test_businessinsider-com/test.py | 0
 .../test_articles/test_sweetshark/__init__.py | 0
 .../test_sweetshark/article.html | 0
 .../test_articles/test_sweetshark/test.py | 0
 10 files changed, 866 deletions(-)
 delete mode 100644 src/breadability/__init__.py
 delete mode 100644 src/breadability/client.py
 delete mode 100644 src/breadability/readable.py
 delete mode 100644 src/breadability/scoring.py
 rename {src/breadability/tests => tests}/test_articles/test_businessinsider-com/__init__.py (100%)
 rename {src/breadability/tests => tests}/test_articles/test_businessinsider-com/article.html (100%)
 rename {src/breadability/tests => tests}/test_articles/test_businessinsider-com/test.py (100%)
 rename {src/breadability/tests => tests}/test_articles/test_sweetshark/__init__.py (100%)
 rename {src/breadability/tests => tests}/test_articles/test_sweetshark/article.html (100%)
 rename {src/breadability/tests => tests}/test_articles/test_sweetshark/test.py (100%)

diff --git a/src/breadability/__init__.py b/src/breadability/__init__.py
deleted file mode 100644
index 26ab585..0000000
--- a/src/breadability/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-VERSION = '0.1.14'
-import client
-from scripts import newtest
diff --git a/src/breadability/client.py b/src/breadability/client.py
deleted file mode 100644
index 4511240..0000000
--- a/src/breadability/client.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import argparse
-import codecs
-import locale
-import sys
-import urllib
-import webbrowser
-
-from tempfile import mkstemp
-
-from breadability import VERSION
-from breadability.logconfig import LOG
-from breadability.logconfig import LNODE
-from breadability.logconfig import set_logging_level
-from breadability.readable import Article
-
-
-LOGLEVEL = 'WARNING'
-USER_AGENT = 'breadability /{version} ({url})'.format(
-    url="https://github.com/mitechie/breadability",
-    version=VERSION
-)
-
-
-# Setup a user agent on the requests out so that we make servers happy. We use
-# a custom agent string to help assist others in identifying the traffic.
-class AppURLopener(urllib.FancyURLopener): - version = USER_AGENT -urllib._urlopener = AppURLopener() - - -def parse_args(): - desc = "A fast python port of arc90's readability tool" - parser = argparse.ArgumentParser(description=desc) - parser.add_argument( - '--version', - action='version', version=VERSION) - - parser.add_argument( - '-v', '--verbose', - action='store_true', - default=False, - help='Increase logging verbosity to DEBUG.') - - parser.add_argument( - '-f', '--fragment', - action='store_false', - default=True, - help='Output html fragment by default.') - -# parser.add_argument('-m', '--metadata', -# action='store_true', -# default=False, -# help='print all metadata as well as content for the content') - - parser.add_argument( - '-b', '--browser', - action='store_true', - default=False, - help='open the parsed content in your web browser') - - parser.add_argument( - '-d', '--debug', - action='store_true', - default=False, - help='Output the detailed scoring information for debugging parsing') - - parser.add_argument( - 'path', metavar='P', - type=str, - nargs=1, - help="The url or file path to process in readable form.") - - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - - if args.verbose: - set_logging_level('DEBUG') - - if args.debug: - LNODE.activate() - - target = args.path[0] - LOG.debug("Target: " + target) - - if target.startswith('http') or target.startswith('www'): - is_url = True - url = target - else: - is_url = False - url = None - - if is_url: - req = urllib.urlopen(target) - content = req.read() - ucontent = unicode(content, 'utf-8') - else: - ucontent = codecs.open(target, "r", "utf-8").read() - - doc = Article(ucontent, url=url, fragment=args.fragment) - if args.browser: - fg, pathname = mkstemp(suffix='.html') - out = codecs.open(pathname, 'w', 'utf-8') - out.write(doc.readable) - out.close() - webbrowser.open(pathname) - else: - # Wrap sys.stdout into a StreamWriter to allow writing unicode. 
- sys.stdout = codecs.getwriter( - locale.getpreferredencoding())(sys.stdout) - sys.stdout.write(doc.readable) - - -if __name__ == '__main__': - main() diff --git a/src/breadability/readable.py b/src/breadability/readable.py deleted file mode 100644 index 05cbe95..0000000 --- a/src/breadability/readable.py +++ /dev/null @@ -1,508 +0,0 @@ -import re -from lxml.etree import tounicode -from lxml.etree import tostring -from lxml.html.clean import Cleaner -from lxml.html import fragment_fromstring -from lxml.html import fromstring -from operator import attrgetter -from pprint import PrettyPrinter - -from breadability.document import OriginalDocument -from breadability.logconfig import LOG -from breadability.logconfig import LNODE -from breadability.scoring import score_candidates -from breadability.scoring import get_link_density -from breadability.scoring import get_class_weight -from breadability.scoring import is_unlikely_node -from breadability.utils import cached_property - - -html_cleaner = Cleaner(scripts=True, javascript=True, comments=True, - style=True, links=True, meta=False, add_nofollow=False, - page_structure=False, processing_instructions=True, - embedded=False, frames=False, forms=False, - annoying_tags=False, remove_tags=None, - remove_unknown_tags=False, safe_attrs_only=False) - - -BASE_DOC = """ - - - - - - - -""" -SCORABLE_TAGS = ['div', 'p', 'td', 'pre', 'article'] - - -def drop_tag(doc, *tags): - """Helper to just remove any nodes that match this html tag passed in - - :param *tags: one or more html tag strings to remove e.g. style, script - - """ - for tag in tags: - found = doc.iterfind(".//" + tag) - for n in found: - LNODE.log(n, 1, "Dropping tag") - n.drop_tree() - return doc - - -def is_bad_link(a_node): - """Helper to determine if the link is something to clean out - - We've hit articles with many multiple links that should be cleaned out - because they're just there to pollute the space. See tests for examples. - - """ - if a_node.tag == 'a': - name = a_node.get('name') - href = a_node.get('href') - if name and not href: - return True - - if href: - url_bits = href.split('#') - if len(url_bits) == 2: - if len(url_bits[1]) > 25: - return True - return False - - -def ok_embedded_video(node): - """Check if this embed/video is an ok one to count.""" - keep_keywords = ['youtube', 'blip.tv', 'vimeo'] - node_str = tounicode(node) - for key in keep_keywords: - if key in node_str: - return True - return False - - -def build_base_document(html, fragment=True): - """Return a base document with the body as root. - - :param html: Parsed Element object - :param fragment: Should we return a
<div> doc fragment or a full <html>
-    doc.
-
-    """
-    if html.tag == 'body':
-        html.tag = 'div'
-        found_body = html
-    else:
-        found_body = html.find('.//body')
-
-    if found_body is None:
-        frag = fragment_fromstring('<div/>')
-        frag.set('id', 'readabilityBody')
-        frag.append(html)
-
-        if not fragment:
-            output = fromstring(BASE_DOC)
-            insert_point = output.find('.//body')
-            insert_point.append(frag)
-        else:
-            output = frag
-    else:
-
-        found_body.tag = 'div'
-        found_body.set('id', 'readabilityBody')
-
-        if not fragment:
-            output = fromstring(BASE_DOC)
-            insert_point = output.find('.//body')
-            insert_point.append(found_body)
-        else:
-            output = found_body
-
-    output.doctype = ""
-    return output
-
-
-def build_error_document(html, fragment=True):
-    """Return an empty erorr document with the body as root.
-
-    :param fragment: Should we return a <div> doc fragment or a full <html>
-    doc.
-
-    """
-    frag = fragment_fromstring('<div/>')
-    frag.set('id', 'readabilityBody')
-    frag.set('class', 'parsing-error')
-
-    if not fragment:
-        output = fromstring(BASE_DOC)
-        insert_point = output.find('.//body')
-        insert_point.append(frag)
-    else:
-        output = frag
-
-    output.doctype = ""
-    return output
-
-
-def transform_misused_divs_into_paragraphs(doc):
-    """Turn all divs that don't have children block level elements into p's
-
-    Since we can't change the tree as we iterate over it, we must do this
-    before we process our document.
-
-    The idea is that we process all divs and if the div does not contain
-    another list of divs, then we replace it with a p tag instead appending
-    it's contents/children to it.
-
-    """
-    for elem in doc.iter(tag='div'):
-        child_tags = [n.tag for n in elem.getchildren()]
-        if 'div' not in child_tags:
-            # if there is no div inside of this div...then it's a leaf
-            # node in a sense.
-            # We need to create a <p> and put all it's contents in there
-            # We'll just stringify it, then regex replace the first/last
-            # div bits to turn them into <p></p> vs <div></div>.
-            LNODE.log(elem, 1, 'Turning leaf <div> into <p></p>
') - orig = tounicode(elem).strip() - started = re.sub(r'^<\s*div', '$', 'p>', started) - elem.getparent().replace(elem, fromstring(ended)) - return doc - - -def check_siblings(candidate_node, candidate_list): - """Look through siblings for content that might also be related. - - Things like preambles, content split by ads that we removed, etc. - - """ - candidate_css = candidate_node.node.get('class') - potential_target = candidate_node.content_score * 0.2 - sibling_target_score = potential_target if potential_target > 10 else 10 - parent = candidate_node.node.getparent() - siblings = parent.getchildren() if parent is not None else [] - - for sibling in siblings: - append = False - content_bonus = 0 - - if sibling is candidate_node.node: - LNODE.log(sibling, 1, 'Sibling is the node so append') - append = True - - # Give a bonus if sibling nodes and top candidates have the example - # same class name - if candidate_css and sibling.get('class') == candidate_css: - content_bonus += candidate_node.content_score * 0.2 - - if sibling in candidate_list: - adjusted_score = candidate_list[sibling].content_score + \ - content_bonus - - if adjusted_score >= sibling_target_score: - append = True - - if sibling.tag == 'p': - link_density = get_link_density(sibling) - content = sibling.text_content() - content_length = len(content) - - if content_length > 80 and link_density < 0.25: - append = True - elif content_length < 80 and link_density == 0: - if ". " in content: - append = True - - if append: - LNODE.log(sibling, 1, 'Sibling being appended') - if sibling.tag not in ['div', 'p']: - # We have a node that isn't a common block level element, like - # a form or td tag. Turn it into a div so it doesn't get - # filtered out later by accident. - sibling.tag = 'div' - - if candidate_node.node != sibling: - candidate_node.node.append(sibling) - - return candidate_node - - -def clean_document(node): - """Clean up the final document we return as the readable article""" - if node is None or len(node) == 0: - return - - LNODE.log(node, 2, "Processing doc") - clean_list = ['object', 'h1'] - to_drop = [] - - # If there is only one h2, they are probably using it as a header and - # not a subheader, so remove it since we already have a header. - if len(node.findall('.//h2')) == 1: - LOG.debug('Adding H2 to list of nodes to clean.') - clean_list.append('h2') - - for n in node.iter(): - LNODE.log(n, 2, "Cleaning iter node") - # clean out any in-line style properties - if 'style' in n.attrib: - n.set('style', '') - - # remove all of the following tags - # Clean a node of all elements of type "tag". - # (Unless it's a youtube/vimeo video. People love movies.) - is_embed = True if n.tag in ['object', 'embed'] else False - if n.tag in clean_list: - allow = False - - # Allow youtube and vimeo videos through as people usually - # want to see those. - if is_embed: - if ok_embedded_video(n): - allow = True - - if not allow: - LNODE.log(n, 2, "Dropping Node") - to_drop.append(n) - - if n.tag in ['h1', 'h2', 'h3', 'h4']: - # clean headings - # if the heading has no css weight or a high link density, - # remove it - if get_class_weight(n) < 0 or get_link_density(n) > .33: - LNODE.log(n, 2, "Dropping , it's insignificant") - to_drop.append(n) - - # clean out extra
<p></p>
-        if n.tag == 'p':
-            # if the p has no children and has no content...well then down
-            # with it.
-            if not n.getchildren() and len(n.text_content()) < 5:
-                LNODE.log(n, 2, 'Dropping extra <p></p>
') - to_drop.append(n) - - # finally try out the conditional cleaning of the target node - if clean_conditionally(n): - to_drop.append(n) - - [n.drop_tree() for n in to_drop if n.getparent() is not None] - return node - - -def clean_conditionally(node): - """Remove the clean_el if it looks like bad content based on rules.""" - target_tags = ['form', 'table', 'ul', 'div', 'p'] - - LNODE.log(node, 2, 'Cleaning conditionally node.') - - if node.tag not in target_tags: - # this is not the tag you're looking for - LNODE.log(node, 2, 'Node cleared.') - return - - weight = get_class_weight(node) - # content_score = LOOK up the content score for this node we found - # before else default to 0 - content_score = 0 - - if (weight + content_score < 0): - LNODE.log(node, 2, 'Dropping conditional node') - LNODE.log(node, 2, 'Weight + score < 0') - return True - - if node.text_content().count(',') < 10: - LOG.debug("There aren't 10 ,s so we're processing more") - - # If there are not very many commas, and the number of - # non-paragraph elements is more than paragraphs or other ominous - # signs, remove the element. - p = len(node.findall('.//p')) - img = len(node.findall('.//img')) - li = len(node.findall('.//li')) - 100 - inputs = len(node.findall('.//input')) - - embed = 0 - embeds = node.findall('.//embed') - for e in embeds: - if ok_embedded_video(e): - embed += 1 - link_density = get_link_density(node) - content_length = len(node.text_content()) - - remove_node = False - - if li > p and node.tag != 'ul' and node.tag != 'ol': - LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol') - remove_node = True - elif inputs > p / 3.0: - LNODE.log(node, 2, 'Conditional drop: inputs > p/3.0') - remove_node = True - elif content_length < 25 and (img == 0 or img > 2): - LNODE.log(node, 2, - 'Conditional drop: len < 25 and 0/>2 images') - remove_node = True - elif weight < 25 and link_density > 0.2: - LNODE.log(node, 2, - 'Conditional drop: weight small and link is dense') - remove_node = True - elif weight >= 25 and link_density > 0.5: - LNODE.log(node, 2, - 'Conditional drop: weight big but link heavy') - remove_node = True - elif (embed == 1 and content_length < 75) or embed > 1: - LNODE.log(node, 2, - 'Conditional drop: embed w/o much content or many embed') - remove_node = True - - if remove_node: - LNODE.log(node, 2, 'Node will be removed') - else: - LNODE.log(node, 2, 'Node cleared') - return remove_node - - # nope, don't remove anything - LNODE.log(node, 2, 'Node Cleared final.') - return False - - -def prep_article(doc): - """Once we've found our target article we want to clean it up. - - Clean out: - - inline styles - - forms - - strip empty
<p></p>
- - extra tags - - """ - doc = clean_document(doc) - return doc - - -def find_candidates(doc): - """Find cadidate nodes for the readable version of the article. - - Here's we're going to remove unlikely nodes, find scores on the rest, and - clean up and return the final best match. - - """ - scorable_node_tags = SCORABLE_TAGS - nodes_to_score = [] - should_remove = [] - - for node in doc.iter(): - if is_unlikely_node(node): - LOG.debug('We should drop unlikely: ' + str(node)) - should_remove.append(node) - continue - if node.tag == 'a' and is_bad_link(node): - LOG.debug('We should drop bad link: ' + str(node)) - should_remove.append(node) - continue - if node.tag in scorable_node_tags and node not in nodes_to_score: - nodes_to_score.append(node) - return score_candidates(nodes_to_score), should_remove - - -class Article(object): - """Parsed readable object""" - _should_drop = [] - - def __init__(self, html, url=None, fragment=True): - """Create the Article we're going to use. - - :param html: The string of html we're going to parse. - :param url: The url so we can adjust the links to still work. - :param fragment: Should we return a
<div></div>
fragment or a full - doc. - - """ - LOG.debug('Url: ' + str(url)) - self.orig = OriginalDocument(html, url=url) - self.fragment = fragment - - def __str__(self): - return tostring(self._readable) - - def __unicode__(self): - return tounicode(self._readable) - - @cached_property(ttl=600) - def doc(self): - """The doc is the parsed xml tree of the given html.""" - try: - doc = self.orig.html - # cleaning doesn't return, just wipes in place - html_cleaner(doc) - doc = drop_tag(doc, 'noscript', 'iframe') - doc = transform_misused_divs_into_paragraphs(doc) - return doc - except ValueError: - return None - - @cached_property(ttl=600) - def candidates(self): - """Generate the list of candidates from the doc.""" - doc = self.doc - if doc is not None and len(doc): - candidates, should_drop = find_candidates(doc) - self._should_drop = should_drop - return candidates - else: - return None - - @cached_property(ttl=600) - def readable(self): - return tounicode(self._readable) - - @cached_property(ttl=600) - def _readable(self): - """The readable parsed article""" - if self.candidates: - LOG.debug('Candidates found:') - pp = PrettyPrinter(indent=2) - - # cleanup by removing the should_drop we spotted. - [n.drop_tree() for n in self._should_drop - if n.getparent() is not None] - - # right now we return the highest scoring candidate content - by_score = sorted([c for c in self.candidates.values()], - key=attrgetter('content_score'), reverse=True) - LOG.debug(pp.pformat(by_score)) - - # since we have several candidates, check the winner's siblings - # for extra content - winner = by_score[0] - LOG.debug('Selected winning node: ' + str(winner)) - updated_winner = check_siblings(winner, self.candidates) - LOG.debug('Begin final prep of article') - updated_winner.node = prep_article(updated_winner.node) - if updated_winner.node is not None: - doc = build_base_document(updated_winner.node, self.fragment) - else: - LOG.warning('Had candidates but failed to find a cleaned winning doc.') - doc = self._handle_no_candidates() - else: - LOG.warning('No candidates found: using document.') - LOG.debug('Begin final prep of article') - doc = self._handle_no_candidates() - - return doc - - def _handle_no_candidates(self): - """If we fail to find a good candidate we need to find something else.""" - # since we've not found a good candidate we're should help this - if self.doc is not None and len(self.doc): - # cleanup by removing the should_drop we spotted. - [n.drop_tree() for n in self._should_drop - if n.getparent() is not None] - doc = prep_article(self.doc) - doc = build_base_document(doc, self.fragment) - else: - LOG.warning('No document to use.') - doc = build_error_document(self.fragment) - - return doc diff --git a/src/breadability/scoring.py b/src/breadability/scoring.py deleted file mode 100644 index 941b22b..0000000 --- a/src/breadability/scoring.py +++ /dev/null @@ -1,237 +0,0 @@ -"""Handle dealing with scoring nodes and content for our parsing.""" -import re -from hashlib import md5 -from lxml.etree import tounicode - -from breadability.logconfig import LNODE -from breadability.logconfig import LOG - -# A series of sets of attributes we check to help in determining if a node is -# a potential candidate or not. 
-CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|' - 'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|' - 'pager|perma|popup|tweet|twitter'), re.I) -CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.I) -CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|' - 'page|pagination|post|text|blog|story'), re.I) -CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|' - 'footnote|head|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|' - 'sidebar|sponsor|shopping|tags|tool|widget'), re.I) - - -def check_node_attr(node, attr, checkset): - value = node.get(attr) or "" - check = checkset.search(value) - if check: - return True - else: - return False - - -def generate_hash_id(node): - """Generate a hash_id for the node in question. - - :param node: lxml etree node - - """ - content = tounicode(node) - hashed = md5() - try: - hashed.update(content.encode('utf-8', "replace")) - except Exception, e: - LOG.error("BOOM! " + str(e)) - - return hashed.hexdigest()[0:8] - - -def get_link_density(node, node_text=None): - """Generate a value for the number of links in the node. - - :param node: pared elementree node - :param node_text: if we already have the text_content() make this easier - on us. - :returns float: - - """ - link_length = sum([len(a.text_content()) or 0 - for a in node.findall(".//a")]) - # For each img, give 50 bonus chars worth of length. - # Tweaking this 50 down a notch should help if we hit false positives. - link_length = max(link_length - - sum([50 for img in node.findall(".//img")]), 0) - if node_text: - text_length = len(node_text) - else: - text_length = len(node.text_content()) - return float(link_length) / max(text_length, 1) - - -def get_class_weight(node): - """Get an elements class/id weight. - - We're using sets to help efficiently check for existence of matches. - - """ - weight = 0 - if check_node_attr(node, 'class', CLS_WEIGHT_NEGATIVE): - weight = weight - 25 - if check_node_attr(node, 'class', CLS_WEIGHT_POSITIVE): - weight = weight + 25 - - if check_node_attr(node, 'id', CLS_WEIGHT_NEGATIVE): - weight = weight - 25 - if check_node_attr(node, 'id', CLS_WEIGHT_POSITIVE): - weight = weight + 25 - - return weight - - -def is_unlikely_node(node): - """Short helper for checking unlikely status. - - If the class or id are in the unlikely list, and there's not also a - class/id in the likely list then it might need to be removed. - - """ - unlikely = check_node_attr(node, 'class', CLS_UNLIKELY) or \ - check_node_attr(node, 'id', CLS_UNLIKELY) - - maybe = check_node_attr(node, 'class', CLS_MAYBE) or \ - check_node_attr(node, 'id', CLS_MAYBE) - - if unlikely and not maybe and node.tag != 'body': - return True - else: - return False - - -def score_candidates(nodes): - """Given a list of potential nodes, find some initial scores to start""" - MIN_HIT_LENTH = 25 - candidates = {} - - for node in nodes: - LNODE.log(node, 1, "Scoring Node") - - content_score = 0 - # if the node has no parent it knows of, then it ends up creating a - # body and html tag to parent the html fragment. - parent = node.getparent() - grand = parent.getparent() if parent is not None else None - innertext = node.text_content() - - if parent is None or grand is None: - LNODE.log( - node, 1, - "Skipping candidate because parent/grand are none") - continue - - # If this paragraph is less than 25 characters, don't even count it. 
- if innertext and len(innertext) < MIN_HIT_LENTH: - LNODE.log( - node, 1, - "Skipping candidate because not enough content.") - continue - - # Initialize readability data for the parent. - # if the parent node isn't in the candidate list, add it - if parent not in candidates: - candidates[parent] = ScoredNode(parent) - - if grand not in candidates: - candidates[grand] = ScoredNode(grand) - - # Add a point for the paragraph itself as a base. - content_score += 1 - - if innertext: - # Add 0.25 points for any commas within this paragraph - content_score += innertext.count(',') * 0.25 - LNODE.log(node, 1, - "Bonus points for ,: " + str(innertext.count(','))) - - # Subtract 0.5 points for each double quote within this paragraph - content_score += innertext.count('"') * (-0.5) - LNODE.log(node, 1, - 'Penalty points for ": ' + str(innertext.count('"'))) - - # For every 100 characters in this paragraph, add another point. - # Up to 3 points. - length_points = len(innertext) / 100 - - if length_points > 3: - content_score += 3 - else: - content_score += length_points - LNODE.log( - node, 1, - "Length/content points: {0} : {1}".format(length_points, - content_score)) - - # Add the score to the parent. - LNODE.log(node, 1, "From this current node.") - candidates[parent].content_score += content_score - LNODE.log( - candidates[parent].node, - 1, - "Giving parent bonus points: " + str( - candidates[parent].content_score)) - # The grandparent gets half. - LNODE.log(candidates[grand].node, 1, "Giving grand bonus points") - candidates[grand].content_score += (content_score / 2.0) - LNODE.log( - candidates[parent].node, - 1, - "Giving grand bonus points: " + str( - candidates[grand].content_score)) - - for candidate in candidates.values(): - adjustment = 1 - get_link_density(candidate.node) - LNODE.log( - candidate.node, - 1, - "Getting link density adjustment: {0} * {1} ".format( - candidate.content_score, adjustment)) - candidate.content_score = candidate.content_score * (adjustment) - - return candidates - - -class ScoredNode(object): - """We need Scored nodes we use to track possible article matches - - We might have a bunch of these so we use __slots__ to keep memory usage - down. 
- - """ - __slots__ = ['node', 'content_score'] - - def __repr__(self): - """Helpful representation of our Scored Node""" - return "{0}: {1:0.1F}\t{2}".format( - self.hash_id, - self.content_score, - self.node) - - def __init__(self, node): - """Given node, set an initial score and weigh based on css and id""" - self.node = node - content_score = 0 - if node.tag in ['div', 'article']: - content_score = 5 - - if node.tag in ['pre', 'td', 'blockquote']: - content_score = 3 - - if node.tag in ['address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li', - 'form']: - content_score = -3 - if node.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th']: - content_score = -5 - - content_score += get_class_weight(node) - self.content_score = content_score - - @property - def hash_id(self): - return generate_hash_id(self.node) diff --git a/src/breadability/tests/test_articles/test_businessinsider-com/__init__.py b/tests/test_articles/test_businessinsider-com/__init__.py similarity index 100% rename from src/breadability/tests/test_articles/test_businessinsider-com/__init__.py rename to tests/test_articles/test_businessinsider-com/__init__.py diff --git a/src/breadability/tests/test_articles/test_businessinsider-com/article.html b/tests/test_articles/test_businessinsider-com/article.html similarity index 100% rename from src/breadability/tests/test_articles/test_businessinsider-com/article.html rename to tests/test_articles/test_businessinsider-com/article.html diff --git a/src/breadability/tests/test_articles/test_businessinsider-com/test.py b/tests/test_articles/test_businessinsider-com/test.py similarity index 100% rename from src/breadability/tests/test_articles/test_businessinsider-com/test.py rename to tests/test_articles/test_businessinsider-com/test.py diff --git a/src/breadability/tests/test_articles/test_sweetshark/__init__.py b/tests/test_articles/test_sweetshark/__init__.py similarity index 100% rename from src/breadability/tests/test_articles/test_sweetshark/__init__.py rename to tests/test_articles/test_sweetshark/__init__.py diff --git a/src/breadability/tests/test_articles/test_sweetshark/article.html b/tests/test_articles/test_sweetshark/article.html similarity index 100% rename from src/breadability/tests/test_articles/test_sweetshark/article.html rename to tests/test_articles/test_sweetshark/article.html diff --git a/src/breadability/tests/test_articles/test_sweetshark/test.py b/tests/test_articles/test_sweetshark/test.py similarity index 100% rename from src/breadability/tests/test_articles/test_sweetshark/test.py rename to tests/test_articles/test_sweetshark/test.py