Don't need the old versions any more

pull/23/head
Richard Harding 11 years ago
parent f4fa0c1040
commit 4cbde9cb5a

@@ -1,508 +0,0 @@
import re
from lxml.etree import tounicode
from lxml.etree import tostring
from lxml.html.clean import Cleaner
from lxml.html import fragment_fromstring
from lxml.html import fromstring
from operator import attrgetter
from pprint import PrettyPrinter
from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.scoring import score_candidates
from breadability.scoring import get_link_density
from breadability.scoring import get_class_weight
from breadability.scoring import is_unlikely_node
from breadability.utils import cached_property
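# lxml Cleaner configured to strip scripts, javascript, comments, styles,
# <link> tags and processing instructions, while keeping meta tags, embeds,
# frames, forms, and the overall page structure for scoring.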
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,
style=True, links=True, meta=False, add_nofollow=False,
page_structure=False, processing_instructions=True,
embedded=False, frames=False, forms=False,
annoying_tags=False, remove_tags=None,
remove_unknown_tags=False, safe_attrs_only=False)
BASE_DOC = """
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">
</head>
<body>
</body>
</html>
"""
SCORABLE_TAGS = ['div', 'p', 'td', 'pre', 'article']
def drop_tag(doc, *tags):
"""Helper to just remove any nodes that match this html tag passed in
:param *tags: one or more html tag strings to remove e.g. style, script
"""
for tag in tags:
found = doc.iterfind(".//" + tag)
for n in found:
LNODE.log(n, 1, "Dropping tag")
n.drop_tree()
return doc
def is_bad_link(a_node):
"""Helper to determine if the link is something to clean out
We've hit articles with many multiple links that should be cleaned out
because they're just there to pollute the space. See tests for examples.
"""
if a_node.tag == 'a':
name = a_node.get('name')
href = a_node.get('href')
if name and not href:
return True
if href:
url_bits = href.split('#')
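            # a same-page anchor whose fragment is longer than 25 chars is
            # treated as a junk link and cleaned out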
if len(url_bits) == 2:
if len(url_bits[1]) > 25:
return True
return False
def ok_embedded_video(node):
"""Check if this embed/video is an ok one to count."""
keep_keywords = ['youtube', 'blip.tv', 'vimeo']
node_str = tounicode(node)
for key in keep_keywords:
if key in node_str:
return True
return False
def build_base_document(html, fragment=True):
"""Return a base document with the body as root.
:param html: Parsed Element object
:param fragment: Should we return a <div> doc fragment or a full <html>
doc.
"""
if html.tag == 'body':
html.tag = 'div'
found_body = html
else:
found_body = html.find('.//body')
if found_body is None:
frag = fragment_fromstring('<div/>')
frag.set('id', 'readabilityBody')
frag.append(html)
if not fragment:
output = fromstring(BASE_DOC)
insert_point = output.find('.//body')
insert_point.append(frag)
else:
output = frag
else:
found_body.tag = 'div'
found_body.set('id', 'readabilityBody')
if not fragment:
output = fromstring(BASE_DOC)
insert_point = output.find('.//body')
insert_point.append(found_body)
else:
output = found_body
output.doctype = "<!DOCTYPE html>"
return output
def build_error_document(fragment=True):
    """Return an empty error document with the body as root.
:param fragment: Should we return a <div> doc fragment or a full <html>
doc.
"""
frag = fragment_fromstring('<div/>')
frag.set('id', 'readabilityBody')
frag.set('class', 'parsing-error')
if not fragment:
output = fromstring(BASE_DOC)
insert_point = output.find('.//body')
insert_point.append(frag)
else:
output = frag
output.doctype = "<!DOCTYPE html>"
return output
def transform_misused_divs_into_paragraphs(doc):
"""Turn all divs that don't have children block level elements into p's
Since we can't change the tree as we iterate over it, we must do this
before we process our document.
The idea is that we process all divs and if the div does not contain
another list of divs, then we replace it with a p tag instead appending
it's contents/children to it.
"""
for elem in doc.iter(tag='div'):
child_tags = [n.tag for n in elem.getchildren()]
if 'div' not in child_tags:
# if there is no div inside of this div...then it's a leaf
# node in a sense.
            # We need to create a <p> and put all its contents in there.
# We'll just stringify it, then regex replace the first/last
# div bits to turn them into <p> vs <div>.
LNODE.log(elem, 1, 'Turning leaf <div> into <p>')
orig = tounicode(elem).strip()
started = re.sub(r'^<\s*div', '<p', orig)
ended = re.sub(r'div>$', 'p>', started)
elem.getparent().replace(elem, fromstring(ended))
return doc
def check_siblings(candidate_node, candidate_list):
"""Look through siblings for content that might also be related.
Things like preambles, content split by ads that we removed, etc.
"""
candidate_css = candidate_node.node.get('class')
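    # the score a sibling must reach to be pulled in: 20% of the candidate's
    # score, with a floor of 10 points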
potential_target = candidate_node.content_score * 0.2
sibling_target_score = potential_target if potential_target > 10 else 10
parent = candidate_node.node.getparent()
siblings = parent.getchildren() if parent is not None else []
for sibling in siblings:
append = False
content_bonus = 0
if sibling is candidate_node.node:
LNODE.log(sibling, 1, 'Sibling is the node so append')
append = True
        # Give a bonus if the sibling node and the top candidate have the
        # same class name
if candidate_css and sibling.get('class') == candidate_css:
content_bonus += candidate_node.content_score * 0.2
if sibling in candidate_list:
adjusted_score = candidate_list[sibling].content_score + \
content_bonus
if adjusted_score >= sibling_target_score:
append = True
if sibling.tag == 'p':
link_density = get_link_density(sibling)
content = sibling.text_content()
content_length = len(content)
if content_length > 80 and link_density < 0.25:
append = True
elif content_length < 80 and link_density == 0:
if ". " in content:
append = True
if append:
LNODE.log(sibling, 1, 'Sibling being appended')
if sibling.tag not in ['div', 'p']:
# We have a node that isn't a common block level element, like
# a form or td tag. Turn it into a div so it doesn't get
# filtered out later by accident.
sibling.tag = 'div'
if candidate_node.node != sibling:
candidate_node.node.append(sibling)
return candidate_node
def clean_document(node):
"""Clean up the final document we return as the readable article"""
if node is None or len(node) == 0:
return
LNODE.log(node, 2, "Processing doc")
clean_list = ['object', 'h1']
to_drop = []
# If there is only one h2, they are probably using it as a header and
# not a subheader, so remove it since we already have a header.
if len(node.findall('.//h2')) == 1:
LOG.debug('Adding H2 to list of nodes to clean.')
clean_list.append('h2')
for n in node.iter():
LNODE.log(n, 2, "Cleaning iter node")
# clean out any in-line style properties
if 'style' in n.attrib:
n.set('style', '')
# remove all of the following tags
# Clean a node of all elements of type "tag".
# (Unless it's a youtube/vimeo video. People love movies.)
        is_embed = n.tag in ['object', 'embed']
if n.tag in clean_list:
allow = False
# Allow youtube and vimeo videos through as people usually
# want to see those.
if is_embed:
if ok_embedded_video(n):
allow = True
if not allow:
LNODE.log(n, 2, "Dropping Node")
to_drop.append(n)
if n.tag in ['h1', 'h2', 'h3', 'h4']:
# clean headings
# if the heading has no css weight or a high link density,
# remove it
if get_class_weight(n) < 0 or get_link_density(n) > .33:
LNODE.log(n, 2, "Dropping <hX>, it's insignificant")
to_drop.append(n)
# clean out extra <p>
if n.tag == 'p':
# if the p has no children and has no content...well then down
# with it.
if not n.getchildren() and len(n.text_content()) < 5:
LNODE.log(n, 2, 'Dropping extra <p>')
to_drop.append(n)
# finally try out the conditional cleaning of the target node
if clean_conditionally(n):
to_drop.append(n)
[n.drop_tree() for n in to_drop if n.getparent() is not None]
return node
def clean_conditionally(node):
"""Remove the clean_el if it looks like bad content based on rules."""
target_tags = ['form', 'table', 'ul', 'div', 'p']
LNODE.log(node, 2, 'Cleaning conditionally node.')
if node.tag not in target_tags:
# this is not the tag you're looking for
LNODE.log(node, 2, 'Node cleared.')
return
weight = get_class_weight(node)
    # TODO: look up the content score we found for this node earlier;
    # for now default to 0
content_score = 0
if (weight + content_score < 0):
LNODE.log(node, 2, 'Dropping conditional node')
LNODE.log(node, 2, 'Weight + score < 0')
return True
if node.text_content().count(',') < 10:
LOG.debug("There aren't 10 ,s so we're processing more")
# If there are not very many commas, and the number of
# non-paragraph elements is more than paragraphs or other ominous
# signs, remove the element.
p = len(node.findall('.//p'))
img = len(node.findall('.//img'))
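        # the li count gets a 100-item head start so only extremely
        # list-heavy nodes are penalised (inherited from readability.js)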
li = len(node.findall('.//li')) - 100
inputs = len(node.findall('.//input'))
embed = 0
embeds = node.findall('.//embed')
for e in embeds:
if ok_embedded_video(e):
embed += 1
link_density = get_link_density(node)
content_length = len(node.text_content())
remove_node = False
if li > p and node.tag != 'ul' and node.tag != 'ol':
LNODE.log(node, 2, 'Conditional drop: li > p and not ul/ol')
remove_node = True
elif inputs > p / 3.0:
LNODE.log(node, 2, 'Conditional drop: inputs > p/3.0')
remove_node = True
elif content_length < 25 and (img == 0 or img > 2):
LNODE.log(node, 2,
'Conditional drop: len < 25 and 0/>2 images')
remove_node = True
elif weight < 25 and link_density > 0.2:
LNODE.log(node, 2,
'Conditional drop: weight small and link is dense')
remove_node = True
elif weight >= 25 and link_density > 0.5:
LNODE.log(node, 2,
'Conditional drop: weight big but link heavy')
remove_node = True
elif (embed == 1 and content_length < 75) or embed > 1:
LNODE.log(node, 2,
'Conditional drop: embed w/o much content or many embed')
remove_node = True
if remove_node:
LNODE.log(node, 2, 'Node will be removed')
else:
LNODE.log(node, 2, 'Node cleared')
return remove_node
# nope, don't remove anything
LNODE.log(node, 2, 'Node Cleared final.')
return False
def prep_article(doc):
"""Once we've found our target article we want to clean it up.
Clean out:
- inline styles
- forms
- strip empty <p>
- extra tags
"""
doc = clean_document(doc)
return doc
def find_candidates(doc):
"""Find cadidate nodes for the readable version of the article.
Here's we're going to remove unlikely nodes, find scores on the rest, and
clean up and return the final best match.
"""
scorable_node_tags = SCORABLE_TAGS
nodes_to_score = []
should_remove = []
for node in doc.iter():
if is_unlikely_node(node):
LOG.debug('We should drop unlikely: ' + str(node))
should_remove.append(node)
continue
if node.tag == 'a' and is_bad_link(node):
LOG.debug('We should drop bad link: ' + str(node))
should_remove.append(node)
continue
if node.tag in scorable_node_tags and node not in nodes_to_score:
nodes_to_score.append(node)
return score_candidates(nodes_to_score), should_remove
class Article(object):
"""Parsed readable object"""
_should_drop = []
def __init__(self, html, url=None, fragment=True):
"""Create the Article we're going to use.
:param html: The string of html we're going to parse.
:param url: The url so we can adjust the links to still work.
:param fragment: Should we return a <div> fragment or a full <html>
doc.
"""
LOG.debug('Url: ' + str(url))
self.orig = OriginalDocument(html, url=url)
self.fragment = fragment
def __str__(self):
return tostring(self._readable)
def __unicode__(self):
return tounicode(self._readable)
@cached_property(ttl=600)
def doc(self):
"""The doc is the parsed xml tree of the given html."""
try:
doc = self.orig.html
# cleaning doesn't return, just wipes in place
html_cleaner(doc)
doc = drop_tag(doc, 'noscript', 'iframe')
doc = transform_misused_divs_into_paragraphs(doc)
return doc
except ValueError:
return None
@cached_property(ttl=600)
def candidates(self):
"""Generate the list of candidates from the doc."""
doc = self.doc
if doc is not None and len(doc):
candidates, should_drop = find_candidates(doc)
self._should_drop = should_drop
return candidates
else:
return None
@cached_property(ttl=600)
def readable(self):
return tounicode(self._readable)
@cached_property(ttl=600)
def _readable(self):
"""The readable parsed article"""
if self.candidates:
LOG.debug('Candidates found:')
pp = PrettyPrinter(indent=2)
# cleanup by removing the should_drop we spotted.
[n.drop_tree() for n in self._should_drop
if n.getparent() is not None]
# right now we return the highest scoring candidate content
by_score = sorted([c for c in self.candidates.values()],
key=attrgetter('content_score'), reverse=True)
LOG.debug(pp.pformat(by_score))
# since we have several candidates, check the winner's siblings
# for extra content
winner = by_score[0]
LOG.debug('Selected winning node: ' + str(winner))
updated_winner = check_siblings(winner, self.candidates)
LOG.debug('Begin final prep of article')
updated_winner.node = prep_article(updated_winner.node)
if updated_winner.node is not None:
doc = build_base_document(updated_winner.node, self.fragment)
else:
LOG.warning('Had candidates but failed to find a cleaned winning doc.')
doc = self._handle_no_candidates()
else:
LOG.warning('No candidates found: using document.')
LOG.debug('Begin final prep of article')
doc = self._handle_no_candidates()
return doc
def _handle_no_candidates(self):
"""If we fail to find a good candidate we need to find something else."""
        # since we've not found a good candidate, fall back to the full doc
if self.doc is not None and len(self.doc):
# cleanup by removing the should_drop we spotted.
[n.drop_tree() for n in self._should_drop
if n.getparent() is not None]
doc = prep_article(self.doc)
doc = build_base_document(doc, self.fragment)
else:
LOG.warning('No document to use.')
doc = build_error_document(self.fragment)
return doc

@@ -1,237 +0,0 @@
"""Handle dealing with scoring nodes and content for our parsing."""
import re
from hashlib import md5
from lxml.etree import tounicode
from breadability.logconfig import LNODE
from breadability.logconfig import LOG
# A series of regular expressions run against class/id attributes to help in
# determining if a node is a potential candidate or not.
CLS_UNLIKELY = re.compile(('combx|comment|community|disqus|extra|foot|header|'
'menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|'
'pager|perma|popup|tweet|twitter'), re.I)
CLS_MAYBE = re.compile('and|article|body|column|main|shadow', re.I)
CLS_WEIGHT_POSITIVE = re.compile(('article|body|content|entry|hentry|main|'
'page|pagination|post|text|blog|story'), re.I)
CLS_WEIGHT_NEGATIVE = re.compile(('combx|comment|com-|contact|foot|footer|'
'footnote|head|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|'
'sidebar|sponsor|shopping|tags|tool|widget'), re.I)
def check_node_attr(node, attr, checkset):
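    """Check a node attribute against the given compiled regex."""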
value = node.get(attr) or ""
check = checkset.search(value)
    return bool(check)
def generate_hash_id(node):
"""Generate a hash_id for the node in question.
:param node: lxml etree node
"""
content = tounicode(node)
hashed = md5()
try:
hashed.update(content.encode('utf-8', "replace"))
except Exception, e:
LOG.error("BOOM! " + str(e))
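    # the first 8 hex characters of the digest are enough to identify a node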
return hashed.hexdigest()[0:8]
def get_link_density(node, node_text=None):
"""Generate a value for the number of links in the node.
    :param node: parsed lxml element node
:param node_text: if we already have the text_content() make this easier
on us.
:returns float:
"""
link_length = sum([len(a.text_content()) or 0
for a in node.findall(".//a")])
# For each img, give 50 bonus chars worth of length.
# Tweaking this 50 down a notch should help if we hit false positives.
link_length = max(link_length -
sum([50 for img in node.findall(".//img")]), 0)
if node_text:
text_length = len(node_text)
else:
text_length = len(node.text_content())
return float(link_length) / max(text_length, 1)
def get_class_weight(node):
"""Get an elements class/id weight.
We're using sets to help efficiently check for existence of matches.
"""
weight = 0
if check_node_attr(node, 'class', CLS_WEIGHT_NEGATIVE):
weight = weight - 25
if check_node_attr(node, 'class', CLS_WEIGHT_POSITIVE):
weight = weight + 25
if check_node_attr(node, 'id', CLS_WEIGHT_NEGATIVE):
weight = weight - 25
if check_node_attr(node, 'id', CLS_WEIGHT_POSITIVE):
weight = weight + 25
return weight
def is_unlikely_node(node):
"""Short helper for checking unlikely status.
If the class or id are in the unlikely list, and there's not also a
class/id in the likely list then it might need to be removed.
"""
unlikely = check_node_attr(node, 'class', CLS_UNLIKELY) or \
check_node_attr(node, 'id', CLS_UNLIKELY)
maybe = check_node_attr(node, 'class', CLS_MAYBE) or \
check_node_attr(node, 'id', CLS_MAYBE)
    return unlikely and not maybe and node.tag != 'body'
def score_candidates(nodes):
"""Given a list of potential nodes, find some initial scores to start"""
MIN_HIT_LENTH = 25
candidates = {}
for node in nodes:
LNODE.log(node, 1, "Scoring Node")
content_score = 0
# if the node has no parent it knows of, then it ends up creating a
# body and html tag to parent the html fragment.
parent = node.getparent()
grand = parent.getparent() if parent is not None else None
innertext = node.text_content()
if parent is None or grand is None:
LNODE.log(
node, 1,
"Skipping candidate because parent/grand are none")
continue
# If this paragraph is less than 25 characters, don't even count it.
if innertext and len(innertext) < MIN_HIT_LENTH:
LNODE.log(
node, 1,
"Skipping candidate because not enough content.")
continue
# Initialize readability data for the parent.
# if the parent node isn't in the candidate list, add it
if parent not in candidates:
candidates[parent] = ScoredNode(parent)
if grand not in candidates:
candidates[grand] = ScoredNode(grand)
# Add a point for the paragraph itself as a base.
content_score += 1
if innertext:
# Add 0.25 points for any commas within this paragraph
content_score += innertext.count(',') * 0.25
LNODE.log(node, 1,
"Bonus points for ,: " + str(innertext.count(',')))
# Subtract 0.5 points for each double quote within this paragraph
content_score += innertext.count('"') * (-0.5)
LNODE.log(node, 1,
'Penalty points for ": ' + str(innertext.count('"')))
# For every 100 characters in this paragraph, add another point.
# Up to 3 points.
length_points = len(innertext) / 100
if length_points > 3:
content_score += 3
else:
content_score += length_points
LNODE.log(
node, 1,
"Length/content points: {0} : {1}".format(length_points,
content_score))
# Add the score to the parent.
LNODE.log(node, 1, "From this current node.")
candidates[parent].content_score += content_score
LNODE.log(
candidates[parent].node,
1,
"Giving parent bonus points: " + str(
candidates[parent].content_score))
# The grandparent gets half.
LNODE.log(candidates[grand].node, 1, "Giving grand bonus points")
candidates[grand].content_score += (content_score / 2.0)
LNODE.log(
            candidates[grand].node,
1,
"Giving grand bonus points: " + str(
candidates[grand].content_score))
for candidate in candidates.values():
adjustment = 1 - get_link_density(candidate.node)
LNODE.log(
candidate.node,
1,
"Getting link density adjustment: {0} * {1} ".format(
candidate.content_score, adjustment))
candidate.content_score = candidate.content_score * (adjustment)
return candidates
class ScoredNode(object):
"""We need Scored nodes we use to track possible article matches
We might have a bunch of these so we use __slots__ to keep memory usage
down.
"""
__slots__ = ['node', 'content_score']
def __repr__(self):
"""Helpful representation of our Scored Node"""
return "{0}: {1:0.1F}\t{2}".format(
self.hash_id,
self.content_score,
self.node)
def __init__(self, node):
"""Given node, set an initial score and weigh based on css and id"""
self.node = node
content_score = 0
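        # seed the score from the tag type: container tags start positive,
        # list/form/heading tags start negative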
if node.tag in ['div', 'article']:
content_score = 5
if node.tag in ['pre', 'td', 'blockquote']:
content_score = 3
if node.tag in ['address', 'ol', 'ul', 'dl', 'dd', 'dt', 'li',
'form']:
content_score = -3
if node.tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th']:
content_score = -5
content_score += get_class_weight(node)
self.content_score = content_score
@property
def hash_id(self):
return generate_hash_id(self.node)