Tests pass for both Python v2.7 and v3.3

pull/21/head
Mišo Belica 11 years ago
parent 912bb50b76
commit 94f6b0a84e

@@ -2,6 +2,8 @@ language: python
python:
- "2.7"
- "2.6"
- "3.2"
- "3.3"
before_install: sudo apt-get install libxml2-dev libxslt-dev
# command to install dependencies
install: pip install -r requirements.txt --use-mirrors

@@ -1,3 +1,3 @@
VERSION = '0.1.11'
import client
from scripts import newtest
from ._version import VERSION
from .scripts import newtest
from . import client
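Python 3 dropped implicit relative imports, so the bare "import client" that used to resolve to breadability/client.py no longer does; the explicit relative form works on Python 2.6+ and Python 3 alike. A minimal illustration of the two forms, assuming code living inside the breadability package:

import client          # Python 2 only: implicit relative import, ImportError on Python 3
from . import client   # explicit relative import, works on Python 2.6+ and Python 3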

@@ -0,0 +1,95 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sys import version_info


PY3 = version_info[0] == 3


if PY3:
    bytes = bytes
    unicode = str
else:
    bytes = str
    unicode = unicode
string_types = (bytes, unicode,)


try:
    callable = callable
except NameError:
    def callable(object):
        """Checks if given object is callable."""
        return hasattr(object, "__call__")


try:
    import urllib2 as urllib
except ImportError:
    import urllib.request as urllib


def to_string(object):
    return to_unicode(object) if PY3 else to_bytes(object)


def to_bytes(object):
    try:
        if isinstance(object, bytes):
            return object
        elif isinstance(object, unicode):
            return object.encode("utf8")
        else:
            # try encode instance to bytes
            return instance_to_bytes(object)
    except UnicodeError:
        # recover from codec error and use 'repr' function
        return to_bytes(repr(object))


def to_unicode(object):
    try:
        if isinstance(object, unicode):
            return object
        elif isinstance(object, bytes):
            return object.decode("utf8")
        else:
            # try decode instance to unicode
            return instance_to_unicode(object)
    except UnicodeError:
        # recover from codec error and use 'repr' function
        return to_unicode(repr(object))


def instance_to_bytes(instance):
    if PY3:
        if hasattr(instance, "__bytes__"):
            return bytes(instance)
        elif hasattr(instance, "__str__"):
            return unicode(instance).encode("utf8")
    else:
        if hasattr(instance, "__str__"):
            return bytes(instance)
        elif hasattr(instance, "__unicode__"):
            return unicode(instance).encode("utf8")

    return to_bytes(repr(instance))


def instance_to_unicode(instance):
    if PY3:
        if hasattr(instance, "__str__"):
            return unicode(instance)
        elif hasattr(instance, "__bytes__"):
            return bytes(instance).decode("utf8")
    else:
        if hasattr(instance, "__unicode__"):
            return unicode(instance)
        elif hasattr(instance, "__str__"):
            return bytes(instance).decode("utf8")

    return to_unicode(repr(instance))
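A minimal usage sketch of these helpers (the sample values are illustrative, not from the commit): to_bytes() and to_unicode() normalize any value to a single concrete type, while to_string() always yields the native str of the running interpreter:

from breadability._py3k import to_bytes, to_string, to_unicode, unicode

assert isinstance(to_unicode(b"<html/>"), unicode)  # always text: unicode on Py2, str on Py3
assert isinstance(to_bytes(u"čárky"), bytes)        # always bytes, encoded as UTF-8
assert isinstance(to_string(42), str)               # native str either way, via instance_to_*()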

@@ -0,0 +1 @@
VERSION = '0.1.11'

@@ -1,3 +1,7 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
import argparse
import codecs
import locale
@@ -7,11 +11,11 @@ import webbrowser
from tempfile import mkstemp
from breadability import VERSION
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.logconfig import set_logging_level
from breadability.readable import Article
from ._version import VERSION
from .logconfig import LOG
from .logconfig import LNODE
from .logconfig import set_logging_level
from .readable import Article
LOGLEVEL = 'WARNING'

@@ -1,5 +1,9 @@
# -*- coding: utf8 -*-
"""Generate a clean nice starting html document to process for an article."""
from __future__ import absolute_import
import chardet
import re
from lxml.etree import tostring
@@ -8,8 +12,9 @@ from lxml.etree import XMLSyntaxError
from lxml.html import document_fromstring
from lxml.html import HTMLParser
from breadability.logconfig import LOG
from breadability.utils import cached_property
from ._py3k import unicode, to_string
from .logconfig import LOG
from .utils import cached_property
utf8_parser = HTMLParser(encoding='utf-8')
@@ -60,7 +65,7 @@ def build_doc(page):
            page_unicode.encode('utf-8', 'replace'),
            parser=utf8_parser)
        return doc
    except XMLSyntaxError, exc:
    except XMLSyntaxError as exc:
        LOG.error('Failed to parse: ' + str(exc))
        raise ValueError('Failed to parse document contents.')
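The "except ExceptionType, name:" syntax was removed in Python 3; "except ExceptionType as name:" is accepted by both Python 2.6+ and 3.x, which is why the same substitution appears in the logconfig and scoring hunks below. A generic illustration, not taken from the commit:

try:
    int("not a number")
except ValueError as exc:  # 'except ValueError, exc:' is a SyntaxError on Python 3
    print(exc)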
@@ -75,7 +80,7 @@ class OriginalDocument(object):
    def __str__(self):
        """Render out our document as a string"""
        return tostring(self.html)
        return to_string(tostring(self.html))

    def __unicode__(self):
        """Render out our document as a string"""

@@ -121,7 +121,7 @@ class LogHelper(object):
        hashed = md5()
        try:
            hashed.update(content.encode('utf-8', errors="replace"))
        except Exception, exc:
        except Exception as exc:
            LOG.error("Cannot hash the current node." + str(exc))
        hash_id = hashed.hexdigest()[0:8]
        # if hash_id in ['9c880b27', '8393b7d7', '69bfebdd']:
@@ -162,7 +162,7 @@ class _LogFormatter(logging.Formatter):
    def format(self, record):
        try:
            record.message = record.getMessage()
        except Exception, e:
        except Exception as e:
            record.message = "Bad message (%r): %r" % (e, record.__dict__)
        record.asctime = time.strftime(
            "%y%m%d %H:%M:%S", self.converter(record.created))

@@ -1,3 +1,7 @@
# -*- coding: utf8 -*-
from __future__ import absolute_import
import re
from lxml.etree import tounicode
from lxml.etree import tostring
@@ -7,14 +11,14 @@ from lxml.html import fromstring
from operator import attrgetter
from pprint import PrettyPrinter
from breadability.document import OriginalDocument
from breadability.logconfig import LOG
from breadability.logconfig import LNODE
from breadability.scoring import score_candidates
from breadability.scoring import get_link_density
from breadability.scoring import get_class_weight
from breadability.scoring import is_unlikely_node
from breadability.utils import cached_property
from .document import OriginalDocument
from .logconfig import LOG
from .logconfig import LNODE
from .scoring import score_candidates
from .scoring import get_link_density
from .scoring import get_class_weight
from .scoring import is_unlikely_node
from .utils import cached_property
html_cleaner = Cleaner(scripts=True, javascript=True, comments=True,

@@ -1,10 +1,15 @@
# -*- coding: utf8 -*-
"""Handle dealing with scoring nodes and content for our parsing."""
from __future__ import absolute_import
import re
from hashlib import md5
from lxml.etree import tounicode
from breadability.logconfig import LNODE
from breadability.logconfig import LOG
from .logconfig import LNODE
from .logconfig import LOG
# A series of sets of attributes we check to help in determining if a node is
# a potential candidate or not.
@@ -38,7 +43,7 @@ def generate_hash_id(node):
    hashed = md5()
    try:
        hashed.update(content.encode('utf-8', "replace"))
    except Exception, e:
    except Exception as e:
        LOG.error("BOOM! " + str(e))
    return hashed.hexdigest()[0:8]
@@ -153,7 +158,7 @@ def score_candidates(nodes):
        # For every 100 characters in this paragraph, add another point.
        # Up to 3 points.
        length_points = len(innertext) / 100
        length_points = len(innertext) // 100
        if length_points > 3:
            content_score += 3
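On Python 3 the / operator performs true division even between integers, so the per-100-characters bonus would become a float; // keeps the integer result the Python 2 code produced. For example:

>>> 250 / 100   # true division on Python 3
2.5
>>> 250 // 100  # floor division, matches the original integer scoring
2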

@@ -1,10 +1,10 @@
import argparse
import codecs
import urllib2
from os import mkdir
from os import path
from breadability import VERSION
from .._version import VERSION
from .._py3k import urllib
TESTPATH = path.join(
@@ -87,7 +87,7 @@ def make_files(dirname):
def fetch_article(dirname, url):
    """Get the content of the url and make it the article.html"""
    opener = urllib2.build_opener()
    opener = urllib.build_opener()
    opener.addheaders = [('Accept-Charset', 'utf-8')]
    url_response = opener.open(url)
    dl_html = url_response.read().decode('utf-8')
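build_opener() lives in urllib2 on Python 2 and in urllib.request on Python 3; because _py3k imports whichever is available under the single name urllib, the fetch code stays identical on both. A sketch with a hypothetical URL:

from breadability._py3k import urllib  # urllib2 on Python 2, urllib.request on Python 3

opener = urllib.build_opener()
opener.addheaders = [('Accept-Charset', 'utf-8')]
html = opener.open("http://example.com/").read().decode('utf-8')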

@@ -6,6 +6,7 @@ try:
except ImportError:
    import unittest
from breadability._py3k import to_unicode
from breadability.document import OriginalDocument
from utils import load_snippet
@@ -17,7 +18,7 @@ class TestOriginalDocument(unittest.TestCase):
    def test_readin_min_document(self):
        """Verify we can read in a min html document"""
        doc = OriginalDocument(load_snippet('document_min.html'))
        self.assertTrue(str(doc).startswith(u'<html>'))
        self.assertTrue(to_unicode(doc).startswith(u'<html>'))
        self.assertEqual(doc.title, 'Min Document Title')

    def test_readin_with_base_url(self):
@@ -25,7 +26,7 @@ class TestOriginalDocument(unittest.TestCase):
        doc = OriginalDocument(
            load_snippet('document_absolute_url.html'),
            url="http://blog.mitechie.com/test.html")
        self.assertTrue(str(doc).startswith(u'<html>'))
        self.assertTrue(to_unicode(doc).startswith(u'<html>'))
        # find the links on the page and make sure each one starts with our
        # base url we told it to use.
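The assertions switch to to_unicode(doc) because to_unicode() falls back to instance_to_unicode() for arbitrary objects, calling the document's __unicode__ on Python 2 and __str__ on Python 3, so the comparison always runs against text. Roughly:

doc = OriginalDocument(load_snippet('document_min.html'))
text = to_unicode(doc)  # unicode(doc) on Python 2, str(doc) on Python 3
assert text.startswith(u'<html>')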
