basic structure

pull/1/head
Adam Pash 8 years ago
parent 155efb3833
commit 7f95b9f44f

@@ -0,0 +1,183 @@
'use strict';
function _interopDefault (ex) { return (ex && (typeof ex === 'object') && 'default' in ex) ? ex['default'] : ex; }
var fs = _interopDefault(require('fs'));
const PHOTO_HINTS = [
'figure',
'photo',
'image',
'caption'
]
const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i')
const POSITIVE_LEAD_IMAGE_URL_HINTS = [
'upload',
'wp-content',
'large',
'photo',
'wp-image',
]
const POSITIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(POSITIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
const NEGATIVE_LEAD_IMAGE_URL_HINTS = [
'spacer',
'sprite',
'blank',
'throbber',
'gradient',
'tile',
'bg',
'background',
'icon',
'social',
'header',
'hdr',
'advert',
'spinner',
'loader',
'loading',
'default',
'rating',
'share',
'facebook',
'twitter',
'theme',
'promo',
'ads',
'wp-includes',
]
const NEGATIVE_LEAD_IMAGE_URL_HINTS_RE = new RegExp(NEGATIVE_LEAD_IMAGE_URL_HINTS.join('|'), 'i')
// A list of strings that denote a positive scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
const POSITIVE_SCORE_HINTS = [
'article',
'articlecontent',
'instapaper_body',
'blog',
'body',
'content',
'entry-content-asset',
'entry',
'hentry',
'main',
'Normal',
'page',
'pagination',
'permalink',
'post',
'story',
'text',
'[-_]copy', //usatoday
'\\Bcopy'
]
// The above list, joined into a matching regular expression
const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i')
// A list of strings that denote a negative scoring for this content as being
// an article container. Checked against className and id.
//
// TODO: Perhaps have these scale based on their odds of being quality?
const NEGATIVE_SCORE_HINTS = [
'adbox',
'advert',
'author',
'bio',
'bookmark',
'bottom',
'byline',
'clear',
'com-',
'combx',
'comment',
'comment\\B',
'contact',
'copy',
'credit',
'crumb',
'date',
'deck',
'excerpt',
'featured', //tnr.com has a featured_content which throws us off
'foot',
'footer',
'footnote',
'graf',
'head',
'info',
'infotext', //newscientist.com copyright
'instapaper_ignore',
'jump',
'linebreak',
'link',
'masthead',
'media',
'meta',
'modal',
'outbrain', //slate.com junk
'promo',
'pr_', // autoblog - press release
'related',
'respond',
'roundcontent', //lifehacker restricted content warning
'scroll',
'secondary',
'share',
'shopping',
'shoutbox',
'side',
'sidebar',
'sponsor',
'stamp',
'sub',
'summary',
'tags',
'tools',
'widget'
]
// The above list, joined into a matching regular expression
const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i')
// A list of words that, if found in link text or URLs, likely mean that
// this link is not a next page link.
const EXTRANEOUS_LINK_HINTS = [
'print',
'archive',
'comment',
'discuss',
'e-mail',
'email',
'share',
'reply',
'all',
'login',
'sign',
'single',
'adx',
'entry-unrelated'
]
const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i')
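// A rough usage sketch: the score-hint regexes above are meant to weight a
// node by its className/id, loosely mirroring the get_weight logic in the
// Python reference included in this commit. The function name and the flat
// treatment of id vs. class here are hypothetical simplifications.
function scoreHintsSketch(idOrClass) {
  let score = 0
  if (NEGATIVE_SCORE_RE.test(idOrClass)) score -= 25
  if (POSITIVE_SCORE_RE.test(idOrClass)) score += 25
  // Keep photo/figure containers around if possible.
  if (PHOTO_HINTS_RE.test(idOrClass)) score += 10
  return score
}
// e.g. scoreHintsSketch('entry-content') === 25, scoreHintsSketch('sidebar') === -25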
const GenericContentExtractor = {
parse: (html) => {
return html
}
}
const GenericExtractor = {
parse: (html) => {
return {
content: GenericContentExtractor.parse(html)
}
}
}
const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')
const result = GenericExtractor.parse(html)
console.log(result)

@@ -0,0 +1,821 @@
import CONSTANTS from './constants'
const GenericContentExtractor = {
parse: (html) => {
return html
}
}
// import logging
//
// from copy import deepcopy
// from . import constants
// from utils.dom.attribmap import AttribMap
// import lxml.html
// from lxml.html import builder as E
// from utils.dom import (
// extract_by_selector,
// inner_text,
// link_density,
// node_to_html,
// )
// from utils.text import (has_sentence_end, normalize_spaces)
// from utils.html import strip_tags
//
// logger = logging.getLogger(__name__)
// # Shortcut
// RE_NAMESPACE = {'re': constants.RE_NS}
//
// class GenericContentExtractor(object):
// """Article content extraction is a beast. For clarities sake, it is
// broken out into its own component.
//
// """
//
// def __init__(self, resource, flags=None, title=None):
// self.resource = resource
//
// self.title = title
//
// if flags:
// self.flags = flags
// else:
// self.flags = {
// "strip_unlikely_candidates": True,
// "weight_nodes": True,
// "clean_conditionally": True,
// }
//
// def extract(self, return_type="html"):
// """ Extract the content for this resource - initially, pass in our
// most restrictive flags which will return the highest quality
// content. On each failure, retry with slightly more lax flags.
//
// :param return_type: string. If "node", should return the content
// as an LXML node rather than as an HTML string.
//
// Flags:
// strip_unlikely_candidates: Remove any elements that match
// non-article-like criteria first. (Like, does this element
// have a classname of "comment")
//
// weight_nodes: Modify an element's score based on whether it has
// certain classNames or IDs. Examples: Subtract if a node has
// a className of 'comment', Add if a node has an ID of
// 'entry-content'.
//
// clean_conditionally: Clean the node to return of some
// superfluous content. Things like forms, ads, etc.
// """
// extraction_flags = ['strip_unlikely_candidates', 'weight_nodes', 'clean_conditionally']
//
// # Cascade through our extraction-specific flags in an ordered fashion,
// # turning them off as we try to extract content.
// clean_conditionally = self.flags.get('clean_conditionally', False)
// node = self.extract_clean_node(
// self._extract_best_node(),
// clean_conditionally=clean_conditionally)
//
// if not self.node_is_sufficient(node):
// # We didn't succeed on first pass, one by one disable our
// # extraction flags and try again.
// for flag in extraction_flags:
// self.flags[flag] = False
// clean_conditionally = self.flags.get(
// 'clean_conditionally',
// False
// )
// node = self.extract_clean_node(
// self._extract_best_node(),
// clean_conditionally=clean_conditionally
// )
//
// # If we found a good node, break out of the flag loop.
// if self.node_is_sufficient(node):
// break
//
// # Once we got here, either we're at our last-resort node, or
// # we broke early. Make sure we at least have -something- before we
// # move forward.
// if node is None:
// return None
//
// # Remove our scoring information from our content
// if 'score' in node.attrib:
// del node.attrib['score']
// for scored_node in node.xpath('.//*[@score]'):
// del scored_node.attrib['score']
//
// if return_type == "html":
// return normalize_spaces(node_to_html(node))
// else:
// return node
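// A rough sketch of how the extract() cascade above might translate to JS:
// relax one extraction flag at a time and retry until the result looks
// article-like. `extractOnce` and `isSufficient` are hypothetical callbacks
// standing in for _extract_best_node/extract_clean_node and node_is_sufficient.
function extractWithFallbacks(extractOnce, isSufficient, flags = {
  stripUnlikelyCandidates: true,
  weightNodes: true,
  cleanConditionally: true,
}) {
  const extractionFlags = ['stripUnlikelyCandidates', 'weightNodes', 'cleanConditionally']
  let node = extractOnce(flags)
  if (!isSufficient(node)) {
    // Disable flags one by one, retrying after each, and stop as soon as the
    // extracted node is good enough (the reference treats >= 100 characters
    // of inner text as sufficient).
    for (const flag of extractionFlags) {
      flags[flag] = false
      node = extractOnce(flags)
      if (isSufficient(node)) break
    }
  }
  return node
}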
//
// def node_is_sufficient(self, node):
// """Given a node, determine if it is article-like enough to return."""
// return (isinstance(node, lxml.html.HtmlElement) and
// len(inner_text(node)) >= 100)
//
//
// def _extract_best_node(self):
// """ Using a variety of scoring techniques, extract the content most
// likely to be article text.
//
// If strip_unlikely_candidates is True, remove any elements that
// match certain criteria first. (Like, does this element have a
// classname of "comment")
//
// If weight_nodes is True, use classNames and IDs to determine the
// worthiness of nodes.
//
// Returns an lxml node.
// """
//
// # deep clone the node so we can get back to our initial parsed state
// # if needed
// # TODO: Performance improvements here? Deepcopy is known to be slow.
// # Can we avoid this somehow?
// root = deepcopy(self.resource)
//
// if self.flags['strip_unlikely_candidates']:
// self._strip_unlikely_candidates(root)
//
// self._convert_to_paragraphs(root)
// self._score_content(root, weight_nodes=self.flags['weight_nodes'])
//
// # print structure(root)
//
// top_candidate = self._find_top_candidate(root)
//
// return top_candidate
//
// def get_weight(self, node):
// """ Get the score of a node based on its className and id. """
// score = 0
//
// if node.get('id'):
// if constants.NEGATIVE_SCORE_RE.search(node.get('id')):
// score -= 25
// if constants.POSITIVE_SCORE_RE.search(node.get('id')):
// score += 25
//
// if node.get('class'):
// # Only score classes on negative/positive if the ID didn't match.
// if score == 0:
// if constants.NEGATIVE_SCORE_RE.search(node.get('class')):
// score -= 25
// if constants.POSITIVE_SCORE_RE.search(node.get('class')):
// score += 25
//
// # Try to keep photos if we can.
// if constants.PHOTO_HINTS_RE.search(node.get('class')):
// score += 10
//
// # Bonus for entry-content-asset, which is explicitly denoted to be
// # more valuable to Readability in the publisher guidelines.
// if 'entry-content-asset' in node.get('class'):
// score += 25
//
// return score
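// A rough JS sketch of get_weight above, operating on plain id/className
// strings so it stays DOM-library agnostic. It assumes the imported CONSTANTS
// module exposes the NEGATIVE_SCORE_RE / POSITIVE_SCORE_RE / PHOTO_HINTS_RE
// regexes defined in the constants file.
function getWeight({ id = '', className = '' }) {
  let score = 0
  if (id) {
    if (CONSTANTS.NEGATIVE_SCORE_RE.test(id)) score -= 25
    if (CONSTANTS.POSITIVE_SCORE_RE.test(id)) score += 25
  }
  if (className) {
    // Only score the class on negative/positive if the id didn't match.
    if (score === 0) {
      if (CONSTANTS.NEGATIVE_SCORE_RE.test(className)) score -= 25
      if (CONSTANTS.POSITIVE_SCORE_RE.test(className)) score += 25
    }
    // Try to keep photos if we can.
    if (CONSTANTS.PHOTO_HINTS_RE.test(className)) score += 10
    // entry-content-asset is explicitly called out as valuable in the
    // publisher guidelines.
    if (className.includes('entry-content-asset')) score += 25
  }
  return score
}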
//
//
// # The removal is implemented as a blacklist and whitelist; this test finds
// # blacklisted elements that aren't whitelisted. We do this all in one
// # expression, both because it's only one pass and because this skips the
// # serialization for whitelisted nodes.
// candidates_blacklist = '|'.join(constants.UNLIKELY_CANDIDATES_BLACKLIST)
// candidates_whitelist = '|'.join(constants.UNLIKELY_CANDIDATES_WHITELIST)
//
// # Note: Regular expressions appear to be about 3 times as fast as looping
// # over each key and matching with contains().
// #
// # TODO: Consider mapping all classnames and ids to hashes and using set
// # intersections for performance.
// candidates_xpath = (
// './/*['
// 'not(self::a) and '
// 're:test(concat(@id, " ", @class), "%s", "i") and '
// 'not( re:test(concat(@id, " ", @class), "%s", "i"))'
// ']'
// ) % (candidates_blacklist, candidates_whitelist)
// def _strip_unlikely_candidates(self, doc):
// """ Loop through the provided document and remove any non-link nodes
// that are unlikely candidates for article content.
//
// Links are ignored because there are very often links to content
// that are identified as non-body-content, but may be inside
// article-like content.
//
// :param doc: an LXML doc to strip nodes from
// :return node: The node itself (even though the conversion happens
// by-reference)
// """
// unlikely_candidates = doc.xpath(self.candidates_xpath,
// namespaces=RE_NAMESPACE)
//
// for node in unlikely_candidates:
// node.drop_tree()
//
// return doc
//
// def _convert_to_paragraphs(self, doc):
// """ Loop through the provided doc, and convert any p-like elements to
// actual paragraph tags.
//
// Things fitting this criteria:
// * Multiple consecutive <br /> tags.
// * <div /> tags without block level elements inside of them
// * <span /> tags who are not children of <p /> or <div /> tags.
//
// :param doc: An LXML node to search through.
// :return an LXML node of the element, cleaned up.
// (By-reference mutation, though. Returned just for convenience.)
// """
//
// # Convert every doubled-<br /> to a paragraph tag.
// self._brs_to_paragraphs(doc)
//
// # Convert every shallow <div /> to a paragraph tag. Ignore divs that
// # contain other block level elements.
// inner_block_tags = './/' + ' or .//'.join(constants.DIV_TO_P_BLOCK_TAGS)
// shallow_divs = doc.xpath('.//div[not(%s)]' % inner_block_tags)
//
// for div in shallow_divs:
// div.tag = 'p'
//
// # Convert every span tag who has no ancestor p or div tag within their
// # family tree to a P as well.
// p_like_spans = doc.xpath('.//span[not(ancestor::p or ancestor::div)]')
// for span in p_like_spans:
// span.tag = 'p'
//
// # If, after all of this, we have no P tags at all, we are probably
// # dealing with some very ugly content that is separated by single BR
// # tags. Convert them individually to P tags.
// if int(doc.xpath('count(//p)')) == 0:
// self._brs_to_paragraphs(doc, min_consecutive=1)
//
// # Remove font and center tags, which are ugly and annoying
// for fonttag in doc.xpath('.//font | .//center'):
// fonttag.drop_tag()
//
//
// ### DO WE EVEN NEED THIS?? -Chris ###
//
// # # Due to the way the paras are inserted, the first paragraph does not
// # # get captured. Since this first para can contain all sorts of random
// # # junk (links, drop caps, images) it's not easy to regex our way to
// # # victory so we do it via dom. - Karl G
// # try:
// # first = node.xpath('.//p[@class = "rdb_br"][position() = 1]')[0]
// # except IndexError:
// # pass
// # else:
// # parent = first.getparent()
// # breaker = None
// # if parent is None:
// # parent = node
// # para = E.P({'class':'rdb_br firstp'})
// # has_predecessors = False
// # for sibling in first.itersiblings(preceding = True):
// # has_predecessors = True
// # if sibling.tag in ['p', 'div']:
// # breaker = sibling
// # break
// # para.insert(0,sibling)
// #
// # if (not has_predecessors and parent.text is not None and
// # parent.text.strip() != ""):
// # para.text = parent.text
// # parent.text = ''
// # else:
// # para.text = (para.text or '') + (parent.tail or '')
// #
// # parent.tail = ''
// # if breaker is None:
// # parent.insert(0,para)
// # else:
// # parent.insert(parent.index(breaker)+1,para)
//
// return doc
//
// def _brs_to_paragraphs(self, doc, min_consecutive=2):
// """ Given an LXML document, convert consecutive <br /> tags into
// <p /> tags instead.
//
// :param doc: An LXML document to convert within.
// :param min_consecutive: Integer, the minimum number of consecutive
// <br /> tags that must exist for them to be converted to <p />
// tags. Must be at least 1.
//
// A word to the wise: This is deceptively tricky, as break tags
// don't behave like normal XML should. Make sure you test
// thoroughly if you make any changes to this code.
// """
// brs = doc.xpath('.//br')
//
// # Loop through all of our break tags, looking for consecutive
// # <br />s with no content in between them. If found, replace them
// # with a single P tag.
// for br in brs:
// # Generate a list of all the breaks in a row, with no text in
// # between them.
// joined_brs = []
// cur_br = br
// while True:
// joined_brs.append(cur_br)
//
// if cur_br.tail:
// break
//
// next = cur_br.getnext()
// next_is_br = next is not None and next.tag.lower() == 'br'
//
// if next_is_br:
// cur_br = next
// else:
// break
//
// if len(joined_brs) < min_consecutive:
// continue
//
// last_br = joined_brs[-1]
//
// # Now loop through following siblings, until we hit a block
// # tag or the end, and append them to this P if they are not a
// # block tag that is not a BR.
// self._paragraphize(last_br)
//
// # Drop every break that we no longer need because of the P.
// # The first BR has been turned into a P tag.
// for joined_br in joined_brs:
// if joined_br is not last_br:
// joined_br.drop_tag()
//
// # If we had any new p tags that are already inside a P tag, resolve
// # those by paragraphizing them, which will append their block level
// # contents.
// for fix_count in xrange(1000):
// # Find the first p that contains another p, and paragraphize it.
// # We do this in a loop because we're modifying the dom as we go.
// try:
// parent_p = doc.xpath('//p[./p][1]')[0]
// self._paragraphize(parent_p)
// except IndexError:
// break
// else:
// # We exhausted our loop, which means we've looped too many times
// # such that it's unreasonable. Log a warning.
// logger.warning("Bailing on p parent fix due to crazy "
// "looping for url %s" % self.resource.url)
//
// def _paragraphize(self, node):
// """ Given a node, turn it into a P if it is not already a P, and
// make sure it conforms to the constraints of a P tag (I.E. does
// not contain any other block tags.)
//
// If the node is a <br />, it treats the following inline siblings
// as if they were its children.
//
// :param node: The node to paragraphize
// """
// is_br = (node.tag.lower() == 'br')
//
// if is_br and node.tail:
// node.text = node.tail
// node.tail = None
//
// node.tag = 'p'
//
// if is_br:
// sibling = node.getnext()
// while True:
// if (sibling is None or (
// sibling.tag in constants.BLOCK_LEVEL_TAGS and
// sibling.tag != 'br'
// )):
// break
//
// next_sibling = sibling.getnext()
// node.append(sibling)
// sibling = next_sibling
//
// else:
// children = node.getchildren()
// i = 0
// il = len(children)
// # Manual index-based looping so we have access to the iterator afterward
// while i < il:
// child = children[i]
// if (child is None or
// (child.tag in constants.BLOCK_LEVEL_TAGS and
// child.tag != 'br')
// ):
// break
// i = i+1
//
// # This means we encountered a block level tag within our P,
// # so we should pop the rest down to siblings.
// if i < il:
// for j in xrange(i, il):
// node.addnext(children[j])
//
//
// ### --- SCORING --- ###
//
// def _get_score(self, node, weight_nodes=True):
// """Get a node's score. If weight_nodes is true, weight classes when
// getting the score as well.
//
// """
// score = node.get('score')
// if score is None:
// score = self._score_node(node)
// if weight_nodes:
// score += self.get_weight(node)
// parent = node.getparent()
// if parent is not None:
// self._set_score(parent, self._get_score(parent) + .25 * score)
// else:
// score = float(score)
// return score
//
// def _set_score(self, node, val):
// """Set the score of a node to val"""
// return node.set('score', str(val))
//
// def _add_score(self, node, val, weight_nodes=True):
// return self._set_score(node, self._get_score(node, weight_nodes) + val)
//
// def _score_content(self, doc, weight_nodes=True):
// """score content. Parents get the full value of their children's
// content score, grandparents half
// """
//
// # First, look for special hNews based selectors and give them a big
// # boost, if they exist
// for selector in constants.HNEWS_CONTENT_SELECTORS:
// # Not self.resource.extract_by_selector because our doc is a copy
// # of the resource doc.
// nodes = extract_by_selector(doc, selector,
// AttribMap(doc))
// for node in nodes:
// self._add_score(node, 80)
//
// paras = doc.xpath('.//p | .//pre')
//
// # If we don't have any paragraphs at all, we can't score based on
// # paragraphs, so return without modifying anything else.
// if len(paras) == 0:
// return doc
//
// for para in paras:
// # Don't score invalid tags
// if not isinstance(para.tag, basestring):
// continue
//
// # The raw score for this paragraph, before we add any parent/child
// # scores.
// raw_score = self._score_node(para)
// self._set_score(para, self._get_score(para, weight_nodes))
//
// parent = para.getparent()
// if parent is not None:
// if parent.tag == 'span':
// parent.tag = 'div'
//
// # Add the individual content score to the parent node
// self._add_score(parent, raw_score, weight_nodes=weight_nodes)
//
// grandparent = parent.getparent()
// if grandparent is not None:
// if grandparent.tag == 'span':
// grandparent.tag = 'div'
//
// # Add half of the individual content score to the
// # grandparent
// gp_score = raw_score / 2.0
// self._add_score(grandparent, gp_score, weight_nodes=weight_nodes)
//
// return doc
//
//
// def _score_node(self, node):
// """Score an individual node. Has some smarts for paragraphs, otherwise
// just scores based on tag currently.
//
// """
// score = 0
//
// if node.tag in ['p', 'li', 'span', 'pre']:
// score += self._score_paragraph(node)
// if node.tag in ['div']:
// score += 5
// elif node.tag in ['pre', 'td', 'blockquote', 'ol', 'ul', 'dl']:
// score += 3
// elif node.tag in ['address', 'form']:
// score -= 3
// elif node.tag in ['th']:
// score -= 5
//
// return score
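// A rough JS sketch of _score_node above as a pure function of a lowercase
// tag name plus an already-computed paragraph score (only applied for
// p/li/span/pre, as in the reference).
function scoreNode(tag, paragraphScore = 0) {
  let score = 0
  if (['p', 'li', 'span', 'pre'].includes(tag)) score += paragraphScore
  if (tag === 'div') {
    score += 5
  } else if (['pre', 'td', 'blockquote', 'ol', 'ul', 'dl'].includes(tag)) {
    score += 3
  } else if (['address', 'form'].includes(tag)) {
    score -= 3
  } else if (tag === 'th') {
    score -= 5
  }
  return score
}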
//
// def _score_paragraph(self, node):
// """Score a paragraph using various methods. Things like number of
// commas, etc. Higher is better.
//
// """
//
// # Start with a point for the paragraph itself as a base.
// score = 1
// text = inner_text(node)
// text_len = len(text)
//
// if text_len == 0:
// if node.getparent() is not None and len(node.getchildren()) == 0:
// node.drop_tree()
// return 0
//
// # If this paragraph is less than 25 characters, don't count it.
// if text_len < 25:
// return 0
//
// # Add points for any commas within this paragraph
// score += text.count(',')
//
// # For every 50 characters in this paragraph, add another point. Up
// # to 3 points.
// chunk_count = (text_len / 50)
// if chunk_count > 0:
// length_bonus = 0
// if node.tag in ('pre', 'p'):
// length_bonus += chunk_count - 2
// else:
// length_bonus += chunk_count - 1.25
// score += min(max(length_bonus, 0), 3)
//
// # Articles can end with short paragraphs when people are being clever
// # but they can also end with short paragraphs setting up lists of junk
// # that we strip. This negative tweaks junk setup paragraphs just below
// # the cutoff threshold.
// if text.endswith(':'):
// score -= 1
//
// return score
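// A rough JS sketch of _score_paragraph above, reduced to a pure function of
// the paragraph's text and tag; dropping empty nodes is left to the DOM code.
function scoreParagraph(text, tag = 'p') {
  const textLen = text.length
  // Paragraphs under 25 characters don't count.
  if (textLen < 25) return 0
  // One base point, plus a point per comma.
  let score = 1 + (text.split(',').length - 1)
  // Up to 3 bonus points for length: one per 50 characters, discounted more
  // heavily for tags other than pre/p.
  const chunkCount = Math.floor(textLen / 50)
  if (chunkCount > 0) {
    const lengthBonus = ['pre', 'p'].includes(tag) ? chunkCount - 2 : chunkCount - 1.25
    score += Math.min(Math.max(lengthBonus, 0), 3)
  }
  // A trailing colon usually means the paragraph just sets up a list of junk,
  // so nudge it below the cutoff.
  if (text.endsWith(':')) score -= 1
  return score
}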
//
// ### ------- TOP CANDIDATE EXTRACTION ------ ###
//
// def _find_top_candidate(self, root):
// # After we've calculated scores, loop through all of the possible
// # candidate nodes we found and find the one with the highest score.
// top_candidate = None
// top_candidate_score = 0
// # Note: .//* is faster than .//*[@score], believe it or not.
// for candidate in root.xpath('.//*'):
//
// if candidate.tag in constants.NON_TOP_CANDIDATE_TAGS:
// continue
//
// candidate_score = self._get_score(candidate)
// if top_candidate is None or candidate_score > top_candidate_score:
// top_candidate = candidate
// top_candidate_score = self._get_score(top_candidate)
//
//
// # If we still have no candidate, just use the body
// if top_candidate is None or len(inner_text(top_candidate)) < 250:
// to_ret = root.find('body')
// if to_ret is None:
// to_ret = root.xpath('.')[0]
// elif top_candidate.getparent() is not None:
// # Now that we have a top_candidate, look through the siblings of
// # it to see if any of them are decently scored. If they are, they
// # may be split parts of the content (Like two divs, a preamble and
// # a body.) Example:
// # http://articles.latimes.com/2009/oct/14/business/fi-bigtvs14
// to_ret = E.DIV()
// sibling_score_threshold = max(10, top_candidate_score * 0.2)
// for child in top_candidate.getparent().iterchildren():
// if not isinstance(child.tag, basestring):
// continue
//
// if self._get_score(child):
// append = False
//
// if child == top_candidate:
// to_ret.append(child)
// continue
//
// density = link_density(child)
// content_bonus = 0
//
// # If the sibling has a very low link density, give a small
// # bonus.
// if density < 0.05:
// content_bonus += 20
//
// # If it's high, give it a penalty
// if density >= 0.5:
// content_bonus -= 20
//
// # If sibling nodes and top candidates have the exact same
// # className, give a bonus
// if child.get('class', False) == top_candidate.get('class'):
// content_bonus += top_candidate_score * 0.2
//
// sibling_score = self._get_score(child) + content_bonus
// if sibling_score >= sibling_score_threshold:
// append = True
// elif child.tag == 'p':
// child_content = child.text_content()
// child_content_len = len(child_content)
//
// if child_content_len > 80 and density < 0.25:
// append = True
// elif (child_content_len <= 80 and density == 0 and
// has_sentence_end(child_content)):
// append = True
//
// if append:
// to_ret.append(child)
// else:
// to_ret = top_candidate
//
// return to_ret
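// A rough JS sketch of the sibling-joining test in _find_top_candidate above,
// written as a pure predicate over pre-computed sibling stats so the DOM
// traversal can stay separate.
function shouldAppendSibling(sibling, topCandidateScore) {
  const { score, linkDensity, tag, textLength, endsWithSentence, sameClassAsTop } = sibling
  const threshold = Math.max(10, topCandidateScore * 0.2)
  let bonus = 0
  if (linkDensity < 0.05) bonus += 20   // barely any links: likely content
  if (linkDensity >= 0.5) bonus -= 20   // mostly links: likely navigation
  if (sameClassAsTop) bonus += topCandidateScore * 0.2
  if (score + bonus >= threshold) return true
  if (tag === 'p') {
    if (textLength > 80 && linkDensity < 0.25) return true
    if (textLength <= 80 && linkDensity === 0 && endsWithSentence) return true
  }
  return false
}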
//
// def extract_clean_node(self, article, clean_conditionally=False):
// """ Clean our article content, returning a new, cleaned node. """
// doc = deepcopy(article)
//
// # Rewrite the tag name to div if it's a top level node like body or
// # html to avoid later complications with multiple body tags.
// if doc.tag in ['html','body']:
// doc.tag = 'div'
//
// for img in doc.xpath('.//img'):
// try:
// img_height = int(img.attrib.get('height', 20))
// img_width = int(img.attrib.get('width', 20))
// if img_height < 10 or img_width < 10:
// # Remove images that explicitly have very small heights or
// # widths, because they are most likely shims or icons,
// # which aren't very useful for reading.
// img.drop_tree()
// elif 'height' in img.attrib:
// # Don't ever specify a height on images, so that we can
// # scale with respect to width without screwing up the
// # aspect ratio.
// del img.attrib['height']
// except:
// pass
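// A rough JS sketch of the image rule above: drop explicitly tiny images
// (missing dimensions default to 20, as in the reference); kept images should
// also lose their height attribute so they can scale by width without
// breaking the aspect ratio.
function shouldDropImage({ height = 20, width = 20 }) {
  return height < 10 || width < 10
}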
//
// # Drop certain tags like <title>, etc
// # This is -mostly- for cleanliness, not security. The lxml Cleaner
// # method in Resource does most of the security stuff for us.
// for tag in doc.xpath('.//' + ' | .//'.join(constants.STRIP_OUTPUT_TAGS)):
// tag.drop_tree()
//
// # Drop spacer images
// spacer_path = './/img[re:test(@src, "trans|transparent|spacer|blank", "i")]'
// for tag in doc.xpath(spacer_path, namespaces={'re': constants.RE_NS}):
// tag.drop_tree()
//
// # H1 tags are typically the article title, which should be extracted
// # by the title extractor instead. If there are fewer than 3 of them (<3),
// # strip them. Otherwise, turn 'em into H2s.
// hOnes = doc.xpath('.//h1')
// if len(hOnes) < 3:
// for e in hOnes:
// e.drop_tree()
// else:
// for e in hOnes:
// e.tag = 'h2'
//
// headers = doc.xpath('.//h2 | .//h3 | .//h4 | .//h5 | .//h6')
// for header in headers:
// drop_header = False
//
// # Remove any headers that are before any p tags in the
// # document. This probably means that it was part of the title, a
// # subtitle or something else extraneous like a datestamp or byline,
// # all of which should be handled by other metadata handling.
// no_previous_ps = int(header.xpath("count(preceding::p[1])")) == 0
// if no_previous_ps:
// similar_header_count = int(doc.xpath('count(.//%s)' % header.tag))
// if similar_header_count < 3:
// drop_header = True
//
// # Remove any headers that match the title exactly.
// if inner_text(header) == self.title:
// drop_header = True
//
// # If this header has a negative weight, it's probably junk.
// # Get rid of it.
// if self.get_weight(header) < 0:
// drop_header = True
//
// if drop_header:
// try:
// header.drop_tree()
// except AssertionError:
// # No parent exists for this node, so just blank it out.
// header.text = ''
//
// for tag in doc.xpath('.//*[@style or @align]'):
// try:
// del tag.attrib['style']
// except KeyError:
// pass
// try:
// del tag.attrib['align']
// except KeyError:
// pass
//
// for para in doc.xpath('.//p'):
// # We have a blank tag
// if (len(inner_text(para)) < 3 and
// len(para.xpath('.//img')) == 0 and
// len(para.xpath('.//iframe')) == 0):
// para.drop_tree()
//
// if clean_conditionally:
// # We used to clean UL's and OL's here, but it was leading to
// # too many in-article lists being removed. Consider a better
// # way to detect menus particularly and remove them.
// self._clean_conditionally(doc, ['ul', 'ol', 'table', 'div'])
//
// return doc
//
// def _clean_conditionally(self, doc, tags):
// """Given a doc, clean it of some superfluous content specified by
// tags. Things like forms, ads, etc.
//
// Tags is an array of tag names to search through (like div, form,
// etc.).
//
// Return this same doc.
// """
// for node in doc.xpath('.//' + ' | .//'.join(tags)):
//
// node_is_list = node.tag in ('ul', 'ol')
//
// weight = self._get_score(node)
// if node.getparent() is None:
// continue
// if weight < 0:
// node.drop_tree()
// else:
// node_content = inner_text(node)
// if node_content.count(',') < 10:
// remove_node = False
// p_count = int(node.xpath('count(.//p)'))
// img_count = int(node.xpath('count(.//img)'))
// input_count = int(node.xpath('count(.//input)'))
// script_count = int(node.xpath('count(.//script)'))
// density = link_density(node)
// content_length = len(inner_text(node))
//
// # Looks like a form, too many inputs.
// if input_count > (p_count / 3):
// remove_node = True
//
// # Content is too short, and there are no images, so
// # this is probably junk content.
// elif content_length < 25 and img_count == 0:
// remove_node = True
//
// # Too high of link density, is probably a menu or
// # something similar.
// elif (weight < 25 and
// density > 0.2 and
// content_length > 75):
// remove_node = True
//
// # Too high of a link density, despite the score being
// # high.
// elif weight >= 25 and density > 0.5:
// remove_node = True
// # Don't remove the node if it's a list and the
// # previous sibling starts with a colon though. That
// # means it's probably content.
// if node_is_list:
// previous_sibling = node.getprevious()
// if (previous_sibling is not None and
// inner_text(previous_sibling)[-1:] == ':'):
// remove_node = False
//
// # Too many script tags, not enough content.
// elif script_count > 0 and len(node_content) < 150:
// remove_node = True
//
// # Explicitly save entry-content-asset tags, which are
// # noted as valuable in the Publisher guidelines. For now
// # this works everywhere. We may want to consider making
// # this less of a sure-thing later.
// if 'entry-content-asset' in node.get('class', ''):
// remove_node = False
//
// if remove_node:
// node.drop_tree()
// return doc
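// A rough JS sketch of the conditional-cleaning decision above as a pure
// function over a node's pre-computed stats, so the eventual DOM code only
// has to gather the numbers before deciding whether to drop the node.
function shouldCleanConditionally(stats) {
  const {
    weight, commaCount, pCount, imgCount, inputCount, scriptCount,
    linkDensity, contentLength, isList, previousEndsWithColon, className = '',
  } = stats
  if (weight < 0) return true
  if (commaCount >= 10) return false
  let remove = false
  if (inputCount > pCount / 3) {
    remove = true // looks like a form: too many inputs
  } else if (contentLength < 25 && imgCount === 0) {
    remove = true // too short and imageless: probably junk
  } else if (weight < 25 && linkDensity > 0.2 && contentLength > 75) {
    remove = true // link-heavy: probably a menu
  } else if (weight >= 25 && linkDensity > 0.5) {
    remove = true // link-heavy despite a high score
    // ...unless it's a list introduced by a colon, which is likely content.
    if (isList && previousEndsWithColon) remove = false
  } else if (scriptCount > 0 && contentLength < 150) {
    remove = true // script-heavy, content-light
  }
  // entry-content-asset blocks are explicitly protected.
  if (className.includes('entry-content-asset')) remove = false
  return remove
}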
export default GenericContentExtractor

@@ -0,0 +1,12 @@
import GenericContentExtractor from './content-extractor.js'
const GenericExtractor = {
parse: (html) => {
return {
content: GenericContentExtractor.parse(html)
}
}
}
export default GenericExtractor

@@ -0,0 +1,7 @@
import fs from 'fs'
import GenericExtractor from './extractor/generic/index.js'
const html = fs.readFileSync('../fixtures/wired.html', 'utf-8')
const result = GenericExtractor.parse(html)
console.log(result)