diff --git a/dist/mercury.js b/dist/mercury.js index c6025137..8d5105e6 100644 --- a/dist/mercury.js +++ b/dist/mercury.js @@ -874,7 +874,63 @@ var LittleThingsExtractor = { clean: [] }, - lead_image_url: null, + lead_image_url: { + selectors: [['meta[name="og:image"]', 'value']] + }, + + next_page_url: null, + + excerpt: null +}; + +// Rename CustomExtractor +// to fit your publication +// (e.g., NYTimesExtractor) +var PoliticoExtractor = { + domain: 'www.politico.com', + title: { + selectors: [ + // enter title selectors + ['meta[name="og:title"]', 'value']] + }, + + author: { + selectors: [ + // enter author selectors + + ] + }, + + content: { + selectors: [ + // enter content selectors + '.story-main-content', '.content-group', '.story-core', '.story-text'], + + // Is there anything in the content you selected that needs transformed + // before it's consumable content? E.g., unusual lazy loaded images + transforms: [], + + // Is there anything that is in the result that shouldn't be? + // The clean selectors will remove anything that matches from + // the result + clean: ['figcaption'] + }, + + date_published: { + selectors: [ + // enter date selectors + ['.timestamp[time="datetime"]']] + }, + + lead_image_url: { + selectors: [ + // enter lead_image_url selectors + ['meta[name="og:image"]', 'value']] + }, + + dek: { + selectors: [['meta[name="description"]', 'value']] + }, next_page_url: null, @@ -894,7 +950,8 @@ var Extractors = { 'www.yahoo.com': YahooExtractor, 'www.buzzfeed.com': BuzzfeedExtractor, 'fandom.wikia.com': WikiaExtractor, - 'www.littlethings.com': LittleThingsExtractor + 'www.littlethings.com': LittleThingsExtractor, + 'www.politico.com': PoliticoExtractor }; diff --git a/dist/mercury.js.map b/dist/mercury.js.map index b4fc3589..c35acdfb 100644 --- a/dist/mercury.js.map +++ b/dist/mercury.js.map @@ -1 +1 @@ -{"version":3,"file":null,"sources":["../src/utils/range.js","../src/utils/validate-url.js","../src/utils/errors.js","../src/resource/utils/constants.js","../src/resource/utils/fetch-resource.js","../src/resource/utils/dom/normalize-meta-tags.js","../src/resource/utils/dom/constants.js","../src/resource/utils/dom/convert-lazy-loaded-images.js","../src/resource/utils/dom/clean.js","../src/resource/index.js","../src/extractors/custom/nymag.com/index.js","../src/extractors/custom/blogspot.com/index.js","../src/extractors/custom/wikipedia.org/index.js","../src/extractors/custom/twitter.com/index.js","../src/extractors/custom/www.nytimes.com/index.js","../src/extractors/custom/www.theatlantic.com/index.js","../src/extractors/custom/www.newyorker.com/index.js","../src/extractors/custom/www.wired.com/index.js","../src/extractors/custom/www.msn.com/index.js","../src/extractors/custom/www.yahoo.com/index.js","../src/extractors/custom/www.buzzfeed.com/index.js","../src/extractors/custom/fandom.wikia.com/index.js","../src/extractors/custom/www.littlethings.com/index.js","../src/extractors/all.js","../src/utils/dom/constants.js","../src/utils/dom/strip-unlikely-candidates.js","../src/utils/dom/brs-to-ps.js","../src/utils/dom/paragraphize.js","../src/utils/dom/convert-to-paragraphs.js","../src/utils/dom/convert-node-to.js","../src/utils/dom/clean-images.js","../src/utils/dom/strip-junk-tags.js","../src/utils/dom/clean-h-ones.js","../src/utils/dom/clean-attributes.js","../src/utils/dom/remove-empty.js","../src/extractors/generic/content/scoring/constants.js","../src/extractors/generic/content/scoring/get-weight.js","../src/extractors/generic/content/scoring/get-score.js","../src/extractors/generic/content/scoring/score-commas.js","../src/extractors/generic/content/scoring/score-length.js","../src/extractors/generic/content/scoring/score-paragraph.js","../src/extractors/generic/content/scoring/set-score.js","../src/extractors/generic/content/scoring/add-score.js","../src/extractors/generic/content/scoring/add-to-parent.js","../src/extractors/generic/content/scoring/get-or-init-score.js","../src/extractors/generic/content/scoring/score-node.js","../src/extractors/generic/content/scoring/score-content.js","../src/utils/text/normalize-spaces.js","../src/utils/text/extract-from-url.js","../src/utils/text/constants.js","../src/utils/text/page-num-from-url.js","../src/utils/text/remove-anchor.js","../src/utils/text/article-base-url.js","../src/utils/text/has-sentence-end.js","../src/extractors/generic/content/scoring/merge-siblings.js","../src/extractors/generic/content/scoring/find-top-candidate.js","../src/utils/dom/clean-tags.js","../src/utils/dom/clean-headers.js","../src/utils/dom/rewrite-top-level.js","../src/utils/dom/make-links-absolute.js","../src/utils/dom/link-density.js","../src/utils/dom/extract-from-meta.js","../src/utils/dom/extract-from-selectors.js","../src/utils/dom/strip-tags.js","../src/utils/dom/within-comment.js","../src/utils/dom/node-is-sufficient.js","../src/utils/dom/is-wordpress.js","../src/cleaners/constants.js","../src/cleaners/author.js","../src/cleaners/lead-image-url.js","../src/cleaners/dek.js","../src/cleaners/date-published.js","../src/cleaners/content.js","../src/cleaners/title.js","../src/cleaners/resolve-split-title.js","../src/cleaners/index.js","../src/extractors/generic/content/extract-best-node.js","../src/extractors/generic/content/extractor.js","../src/extractors/generic/title/constants.js","../src/extractors/generic/title/extractor.js","../src/extractors/generic/author/constants.js","../src/extractors/generic/author/extractor.js","../src/extractors/generic/date-published/constants.js","../src/extractors/generic/date-published/extractor.js","../src/extractors/generic/dek/extractor.js","../src/extractors/generic/lead-image-url/constants.js","../src/extractors/generic/lead-image-url/score-image.js","../src/extractors/generic/lead-image-url/extractor.js","../src/extractors/generic/next-page-url/scoring/utils/score-similarity.js","../src/extractors/generic/next-page-url/scoring/utils/score-link-text.js","../src/extractors/generic/next-page-url/scoring/utils/score-page-in-link.js","../src/extractors/generic/next-page-url/scoring/constants.js","../src/extractors/generic/next-page-url/scoring/utils/score-extraneous-links.js","../src/extractors/generic/next-page-url/scoring/utils/score-by-parents.js","../src/extractors/generic/next-page-url/scoring/utils/score-prev-link.js","../src/extractors/generic/next-page-url/scoring/utils/should-score.js","../src/extractors/generic/next-page-url/scoring/utils/score-base-url.js","../src/extractors/generic/next-page-url/scoring/utils/score-next-link-text.js","../src/extractors/generic/next-page-url/scoring/utils/score-cap-links.js","../src/extractors/generic/next-page-url/scoring/score-links.js","../src/extractors/generic/next-page-url/extractor.js","../src/extractors/generic/url/constants.js","../src/extractors/generic/url/extractor.js","../src/extractors/generic/excerpt/constants.js","../src/extractors/generic/excerpt/extractor.js","../src/extractors/generic/word-count/extractor.js","../src/extractors/generic/index.js","../src/extractors/get-extractor.js","../src/extractors/root-extractor.js","../src/extractors/collect-all-pages.js","../src/mercury.js"],"sourcesContent":["export default function* range(start = 1, end = 1) {\n while (start <= end) {\n yield start += 1;\n }\n}\n","// extremely simple url validation as a first step\nexport default function validateUrl({ hostname }) {\n // If this isn't a valid url, return an error message\n return !!hostname;\n}\n","const Errors = {\n badUrl: {\n error: true,\n messages: 'The url parameter passed does not look like a valid URL. Please check your data and try again.',\n },\n};\n\nexport default Errors;\n","export const REQUEST_HEADERS = {\n 'User-Agent': 'Readability - http://readability.com/about/',\n};\n\n// The number of milliseconds to attempt to fetch a resource before timing out.\nexport const FETCH_TIMEOUT = 10000;\n\n// Content types that we do not extract content from\nconst BAD_CONTENT_TYPES = [\n 'audio/mpeg',\n 'image/gif',\n 'image/jpeg',\n 'image/jpg',\n];\n\nexport const BAD_CONTENT_TYPES_RE = new RegExp(`^(${BAD_CONTENT_TYPES.join('|')})$`, 'i');\n\n\n// Use this setting as the maximum size an article can be\n// for us to attempt parsing. Defaults to 5 MB.\nexport const MAX_CONTENT_LENGTH = 5242880;\n\n// Turn the global proxy on or off\n// Proxying is not currently enabled in Python source\n// so not implementing logic in port.\nexport const PROXY_DOMAINS = false;\nexport const REQUESTS_PROXIES = {\n http: 'http://38.98.105.139:33333',\n https: 'http://38.98.105.139:33333',\n};\n\nexport const DOMAINS_TO_PROXY = [\n 'nih.gov',\n 'gutenberg.org',\n];\n","import 'babel-polyfill';\n\nimport URL from 'url';\nimport request from 'request';\nimport { Errors } from 'utils';\n\nimport {\n REQUEST_HEADERS,\n FETCH_TIMEOUT,\n BAD_CONTENT_TYPES_RE,\n MAX_CONTENT_LENGTH,\n} from './constants';\n\nfunction get(options) {\n return new Promise((resolve, reject) => {\n request(options, (err, response, body) => {\n if (err) {\n reject(err);\n } else {\n resolve({ body, response });\n }\n });\n });\n}\n\n// Evaluate a response to ensure it's something we should be keeping.\n// This does not validate in the sense of a response being 200 level or\n// not. Validation here means that we haven't found reason to bail from\n// further processing of this url.\n\nexport function validateResponse(response, parseNon2xx = false) {\n // Check if we got a valid status code\n if (response.statusMessage !== 'OK') {\n if (!response.statusCode) {\n throw new Error(\n `Unable to fetch content. Original exception was ${response.error}`\n );\n } else if (!parseNon2xx) {\n throw new Error(\n `Resource returned a response status code of ${response.statusCode} and resource was instructed to reject non-2xx level status codes.`\n );\n }\n }\n\n const {\n 'content-type': contentType,\n 'content-length': contentLength,\n } = response.headers;\n\n // Check that the content is not in BAD_CONTENT_TYPES\n if (BAD_CONTENT_TYPES_RE.test(contentType)) {\n throw new Error(\n `Content-type for this resource was ${contentType} and is not allowed.`\n );\n }\n\n // Check that the content length is below maximum\n if (contentLength > MAX_CONTENT_LENGTH) {\n throw new Error(\n `Content for this resource was too large. Maximum content length is ${MAX_CONTENT_LENGTH}.`\n );\n }\n\n return true;\n}\n\n// Grabs the last two pieces of the URL and joins them back together\n// This is to get the 'livejournal.com' from 'erotictrains.livejournal.com'\nexport function baseDomain({ host }) {\n return host.split('.').slice(-2).join('.');\n}\n\n// Set our response attribute to the result of fetching our URL.\n// TODO: This should gracefully handle timeouts and raise the\n// proper exceptions on the many failure cases of HTTP.\n// TODO: Ensure we are not fetching something enormous. Always return\n// unicode content for HTML, with charset conversion.\n\nexport default async function fetchResource(url, parsedUrl) {\n parsedUrl = parsedUrl || URL.parse(encodeURI(url));\n\n const options = {\n url: parsedUrl,\n headers: { ...REQUEST_HEADERS },\n timeout: FETCH_TIMEOUT,\n // Don't set encoding; fixes issues\n // w/gzipped responses\n encoding: null,\n // Accept cookies\n jar: true,\n // Accept and decode gzip\n gzip: true,\n // Follow any redirect\n followAllRedirects: true,\n };\n\n const { response, body } = await get(options);\n\n try {\n validateResponse(response);\n return { body, response };\n } catch (e) {\n return Errors.badUrl;\n }\n}\n","function convertMetaProp($, from, to) {\n $(`meta[${from}]`).each((_, node) => {\n const $node = $(node);\n\n const value = $node.attr(from);\n $node.attr(to, value);\n $node.removeAttr(from);\n });\n\n return $;\n}\n\n// For ease of use in extracting from meta tags,\n// replace the \"content\" attribute on meta tags with the\n// \"value\" attribute.\n//\n// In addition, normalize 'property' attributes to 'name' for ease of\n// querying later. See, e.g., og or twitter meta tags.\n\nexport default function normalizeMetaTags($) {\n $ = convertMetaProp($, 'content', 'value');\n $ = convertMetaProp($, 'property', 'name');\n return $;\n}\n","export const IS_LINK = new RegExp('https?://', 'i');\nexport const IS_IMAGE = new RegExp('.(png|gif|jpe?g)', 'i');\n\nexport const TAGS_TO_REMOVE = [\n 'script',\n 'style',\n 'form',\n].join(',');\n","import 'babel-polyfill';\n\nimport {\n IS_LINK,\n IS_IMAGE,\n} from './constants';\n\n// Convert all instances of images with potentially\n// lazy loaded images into normal images.\n// Many sites will have img tags with no source, or an image tag with a src\n// attribute that a is a placeholer. We need to be able to properly fill in\n// the src attribute so the images are no longer lazy loaded.\nexport default function convertLazyLoadedImages($) {\n $('img').each((_, img) => {\n Reflect.ownKeys(img.attribs).forEach((attr) => {\n const value = img.attribs[attr];\n\n if (attr !== 'src' && IS_LINK.test(value) &&\n IS_IMAGE.test(value)) {\n $(img).attr('src', value);\n }\n });\n });\n\n return $;\n}\n","import { TAGS_TO_REMOVE } from './constants';\n\nfunction isComment(index, node) {\n return node.type === 'comment';\n}\n\nfunction cleanComments($) {\n $.root().find('*')\n .contents()\n .filter(isComment)\n .remove();\n\n return $;\n}\n\nexport default function clean($) {\n $(TAGS_TO_REMOVE).remove();\n\n $ = cleanComments($);\n return $;\n}\n","import 'babel-polyfill';\n\nimport cheerio from 'cheerio';\n\nimport { fetchResource } from './utils';\nimport {\n normalizeMetaTags,\n convertLazyLoadedImages,\n clean,\n} from './utils/dom';\n\nconst Resource = {\n\n // Create a Resource.\n //\n // :param url: The URL for the document we should retrieve.\n // :param response: If set, use as the response rather than\n // attempting to fetch it ourselves. Expects a\n // string.\n async create(url, preparedResponse, parsedUrl) {\n let result;\n\n if (preparedResponse) {\n const validResponse = {\n statusMessage: 'OK',\n statusCode: 200,\n headers: {\n 'content-type': 'text/html',\n 'content-length': 500,\n },\n };\n\n result = { body: preparedResponse, response: validResponse };\n } else {\n result = await fetchResource(url, parsedUrl);\n }\n\n if (result.error) {\n return result;\n }\n\n return this.generateDoc(result);\n },\n\n generateDoc({ body: content, response }) {\n const { 'content-type': contentType } = response.headers;\n\n // TODO: Implement is_text function from\n // https://github.com/ReadabilityHoldings/readability/blob/8dc89613241d04741ebd42fa9fa7df1b1d746303/readability/utils/text.py#L57\n if (!contentType.includes('html') &&\n !contentType.includes('text')) {\n throw new Error('Content does not appear to be text.');\n }\n\n let $ = cheerio.load(content, { normalizeWhitespace: true });\n\n if ($.root().children().length === 0) {\n throw new Error('No children, likely a bad parse.');\n }\n\n $ = normalizeMetaTags($);\n $ = convertLazyLoadedImages($);\n $ = clean($);\n\n return $;\n },\n};\n\nexport default Resource;\n","export const NYMagExtractor = {\n domain: 'nymag.com',\n content: {\n // Order by most likely. Extractor will stop on first occurrence\n selectors: [\n 'div.article-content',\n 'section.body',\n 'article.article',\n ],\n\n // Selectors to remove from the extracted content\n clean: [\n '.ad',\n '.single-related-story',\n ],\n\n // Object of tranformations to make on matched elements\n // Each key is the selector, each value is the tag to\n // transform to.\n // If a function is given, it should return a string\n // to convert to or nothing (in which case it will not perform\n // the transformation.\n transforms: {\n // Convert h1s to h2s\n h1: 'h2',\n\n // Convert lazy-loaded noscript images to figures\n noscript: ($node) => {\n const $children = $node.children();\n if ($children.length === 1 && $children.get(0).tagName === 'img') {\n return 'figure';\n }\n\n return null;\n },\n },\n },\n\n title: {\n selectors: [\n 'h1.lede-feature-title',\n 'h1.headline-primary',\n 'h1',\n ],\n },\n\n author: {\n selectors: [\n '.by-authors',\n '.lede-feature-author',\n ],\n },\n\n dek: {\n selectors: [\n '.lede-feature-teaser',\n ],\n },\n\n date_published: {\n selectors: [\n ['time.article-timestamp[datetime]', 'datetime'],\n 'time.article-timestamp',\n ],\n },\n};\n","export const BloggerExtractor = {\n domain: 'blogspot.com',\n content: {\n // Blogger is insane and does not load its content\n // initially in the page, but it's all there\n // in noscript\n selectors: [\n '.post-content noscript',\n ],\n\n // Selectors to remove from the extracted content\n clean: [\n ],\n\n // Convert the noscript tag to a div\n transforms: {\n noscript: 'div',\n },\n },\n\n author: {\n selectors: [\n '.post-author-name',\n ],\n },\n\n title: {\n selectors: [\n 'h2.title',\n ],\n },\n\n date_published: {\n selectors: [\n 'span.publishdate',\n ],\n },\n};\n","export const WikipediaExtractor = {\n domain: 'wikipedia.org',\n content: {\n selectors: [\n '#mw-content-text',\n ],\n\n defaultCleaner: false,\n\n // transform top infobox to an image with caption\n transforms: {\n '.infobox img': ($node) => {\n const $parent = $node.parents('.infobox');\n // Only prepend the first image in .infobox\n if ($parent.children('img').length === 0) {\n $parent.prepend($node);\n }\n },\n '.infobox caption': 'figcaption',\n '.infobox': 'figure',\n },\n\n // Selectors to remove from the extracted content\n clean: [\n '.mw-editsection',\n 'figure tr, figure td, figure tbody',\n '#toc',\n '.navbox',\n ],\n\n },\n\n author: 'Wikipedia Contributors',\n\n title: {\n selectors: [\n 'h2.title',\n ],\n },\n\n date_published: {\n selectors: [\n '#footer-info-lastmod',\n ],\n },\n\n};\n","export const TwitterExtractor = {\n domain: 'twitter.com',\n\n content: {\n transforms: {\n // We're transforming essentially the whole page here.\n // Twitter doesn't have nice selectors, so our initial\n // selector grabs the whole page, then we're re-writing\n // it to fit our needs before we clean it up.\n '.permalink[role=main]': ($node, $) => {\n const tweets = $node.find('.tweet');\n const $tweetContainer = $('
');\n $tweetContainer.append(tweets);\n $node.replaceWith($tweetContainer);\n },\n\n // Twitter wraps @ with s, which\n // renders as a strikethrough\n s: 'span',\n },\n\n selectors: [\n '.permalink[role=main]',\n ],\n\n defaultCleaner: false,\n\n clean: [\n '.stream-item-footer',\n 'button',\n '.tweet-details-fixer',\n ],\n },\n\n author: {\n selectors: [\n '.tweet.permalink-tweet .username',\n ],\n },\n\n date_published: {\n selectors: [\n ['.permalink-tweet ._timestamp[data-time-ms]', 'data-time-ms'],\n // '.tweet.permalink-tweet .metadata',\n ],\n },\n\n};\n","export const NYTimesExtractor = {\n title: {\n selectors: [\n '.g-headline',\n 'h1.headline',\n ],\n },\n\n author: {\n selectors: [\n '.g-byline',\n '.byline',\n ],\n },\n\n content: {\n selectors: [\n 'div.g-blocks',\n 'article#story',\n ],\n\n defaultCleaner: false,\n\n transforms: {\n 'img.g-lazy': ($node) => {\n let src = $node.attr('src');\n // const widths = $node.attr('data-widths')\n // .slice(1)\n // .slice(0, -1)\n // .split(',');\n // if (widths.length) {\n // width = widths.slice(-1);\n // } else {\n // width = '900';\n // }\n const width = 640;\n\n src = src.replace('{{size}}', width);\n $node.attr('src', src);\n },\n },\n\n clean: [\n '.ad',\n 'header#story-header',\n '.story-body-1 .lede.video',\n '.visually-hidden',\n '#newsletter-promo',\n '.promo',\n '.comments-button',\n '.hidden',\n ],\n },\n\n date_published: null,\n\n lead_image_url: null,\n\n dek: null,\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\nexport const TheAtlanticExtractor = {\n domain: 'www.theatlantic.com',\n title: {\n selectors: [\n 'h1.hed',\n ],\n },\n\n author: {\n selectors: [\n 'article#article .article-cover-extra .metadata .byline a',\n ],\n },\n\n content: {\n selectors: [\n '.article-body',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n date_published: null,\n\n lead_image_url: null,\n\n dek: null,\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const NewYorkerExtractor = {\n domain: 'www.newyorker.com',\n title: {\n selectors: [\n 'h1.title',\n ],\n },\n\n author: {\n selectors: [\n '.contributors',\n ],\n },\n\n content: {\n selectors: [\n 'div#articleBody',\n 'div.articleBody',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ['meta[name=\"og:description\"]', 'value'],\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const WiredExtractor = {\n domain: 'www.wired.com',\n title: {\n selectors: [\n 'h1.post-title',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'a[rel=\"author\"]',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n 'article.content',\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.visually-hidden',\n\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[itemprop=\"datePublished\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ['meta[name=\"og:description\"]', 'value'],\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const MSNExtractor = {\n domain: 'www.msn.com',\n title: {\n selectors: [\n 'h1',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'span.authorname-txt',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n 'div.richtext',\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n 'span.caption',\n\n ],\n },\n\n date_published: {\n selectors: [\n 'span.time',\n ],\n },\n\n lead_image_url: {\n selectors: [\n\n ],\n },\n\n dek: {\n selectors: [\n ['meta[name=\"description\"]', 'value'],\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const YahooExtractor = {\n domain: 'www.yahoo.com',\n title: {\n selectors: [\n 'header.canvas-header',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'span.provider-name',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n '.content-canvas',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n '.figure-caption',\n\n ],\n },\n\n date_published: {\n selectors: [\n 'time.date',\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ['meta[name=\"og:description\"]', 'value'],\n // enter dek selectors\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const BuzzfeedExtractor = {\n domain: 'www.buzzfeed.com',\n title: {\n selectors: [\n 'h1[id=\"post-title\"]',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n 'a[data-action=\"user/username\"]', 'byline__author',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n date_published: {\n selectors: [\n '.buzz-datetime',\n // enter author selectors\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ['meta[name=\"description\"]', 'value'],\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const WikiaExtractor = {\n domain: 'fandom.wikia.com',\n title: {\n selectors: [\n 'h1.entry-title',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n '.author vcard', '.fn',\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n '.grid-content',\n '.entry-content',\n // enter content selectors\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n date_published: {\n selectors: [\n ['meta[name=\"article:published_time\"]', 'value'],\n ],\n },\n\n lead_image_url: {\n selectors: [\n ['meta[name=\"og:image\"]', 'value'],\n ],\n },\n\n dek: {\n selectors: [\n ['meta[name=\"og:description\"]', 'value'],\n ],\n },\n\n next_page_url: null,\n\n excerpt: null,\n};\n","// Rename CustomExtractor\n// to fit your publication\n// (e.g., NYTimesExtractor)\nexport const LittleThingsExtractor = {\n domain: 'www.littlethings.com',\n title: {\n selectors: [\n 'h1.post-title',\n // enter title selectors\n ],\n },\n\n author: {\n selectors: [\n ['meta[name=\"author\"]', 'value'],\n // enter author selectors\n ],\n },\n\n content: {\n selectors: [\n // enter content selectors\n '.mainContentIntro',\n '.content-wrapper',\n ],\n\n // Is there anything in the content you selected that needs transformed\n // before it's consumable content? E.g., unusual lazy loaded images\n transforms: [\n ],\n\n // Is there anything that is in the result that shouldn't be?\n // The clean selectors will remove anything that matches from\n // the result\n clean: [\n\n ],\n },\n\n lead_image_url: null,\n\n next_page_url: null,\n\n excerpt: null,\n};\n","import { NYMagExtractor } from './custom/nymag.com';\nimport { BloggerExtractor } from './custom/blogspot.com';\nimport { WikipediaExtractor } from './custom/wikipedia.org';\nimport { TwitterExtractor } from './custom/twitter.com';\nimport { NYTimesExtractor } from './custom/www.nytimes.com';\nimport { TheAtlanticExtractor } from './custom/www.theatlantic.com';\nimport { NewYorkerExtractor } from './custom/www.newyorker.com';\nimport { WiredExtractor } from './custom/www.wired.com';\nimport { MSNExtractor } from './custom/www.msn.com';\nimport { YahooExtractor } from './custom/www.yahoo.com';\nimport { BuzzfeedExtractor } from './custom/www.buzzfeed.com';\nimport { WikiaExtractor } from './custom/fandom.wikia.com';\nimport { LittleThingsExtractor } from './custom/www.littlethings.com';\n\n\nconst Extractors = {\n 'nymag.com': NYMagExtractor,\n 'blogspot.com': BloggerExtractor,\n 'wikipedia.org': WikipediaExtractor,\n 'twitter.com': TwitterExtractor,\n 'www.nytimes.com': NYTimesExtractor,\n 'www.theatlantic.com': TheAtlanticExtractor,\n 'www.newyorker.com': NewYorkerExtractor,\n 'www.wired.com': WiredExtractor,\n 'www.msn.com': MSNExtractor,\n 'www.yahoo.com': YahooExtractor,\n 'www.buzzfeed.com': BuzzfeedExtractor,\n 'fandom.wikia.com': WikiaExtractor,\n 'www.littlethings.com': LittleThingsExtractor,\n\n};\n\nexport default Extractors;\n","// Spacer images to be removed\nexport const SPACER_RE = new RegExp('trans|transparent|spacer|blank', 'i');\n\n// A list of tags to strip from the output if we encounter them.\nexport const STRIP_OUTPUT_TAGS = [\n 'title',\n 'script',\n 'noscript',\n 'link',\n 'style',\n 'hr',\n 'embed',\n 'iframe',\n 'object',\n];\n\n// cleanAttributes\nexport const REMOVE_ATTRS = ['style', 'align'];\nexport const REMOVE_ATTR_SELECTORS = REMOVE_ATTRS.map(selector => `[${selector}]`);\nexport const REMOVE_ATTR_LIST = REMOVE_ATTRS.join(',');\nexport const WHITELIST_ATTRS = ['src', 'srcset', 'href', 'class', 'id', 'alt', 'score'];\nexport const WHITELIST_ATTRS_RE = new RegExp(`^(${WHITELIST_ATTRS.join('|')})$`, 'i');\n\n// removeEmpty\nexport const REMOVE_EMPTY_TAGS = ['p'];\nexport const REMOVE_EMPTY_SELECTORS = REMOVE_EMPTY_TAGS.map(tag => `${tag}:empty`).join(',');\n\n// cleanTags\nexport const CLEAN_CONDITIONALLY_TAGS = ['ul', 'ol', 'table', 'div', 'button', 'form'].join(',');\n\n// cleanHeaders\nconst HEADER_TAGS = ['h2', 'h3', 'h4', 'h5', 'h6'];\nexport const HEADER_TAG_LIST = HEADER_TAGS.join(',');\n\n\n// // CONTENT FETCHING CONSTANTS ////\n\n// A list of strings that can be considered unlikely candidates when\n// extracting content from a resource. These strings are joined together\n// and then tested for existence using re:test, so may contain simple,\n// non-pipe style regular expression queries if necessary.\nexport const UNLIKELY_CANDIDATES_BLACKLIST = [\n 'ad-break',\n 'adbox',\n 'advert',\n 'addthis',\n 'agegate',\n 'aux',\n 'blogger-labels',\n 'combx',\n 'comment',\n 'conversation',\n 'disqus',\n 'entry-unrelated',\n 'extra',\n 'foot',\n // 'form', // This is too generic, has too many false positives\n 'header',\n 'hidden',\n 'loader',\n 'login', // Note: This can hit 'blogindex'.\n 'menu',\n 'meta',\n 'nav',\n 'outbrain',\n 'pager',\n 'pagination',\n 'predicta', // readwriteweb inline ad box\n 'presence_control_external', // lifehacker.com container full of false positives\n 'popup',\n 'printfriendly',\n 'related',\n 'remove',\n 'remark',\n 'rss',\n 'share',\n 'shoutbox',\n 'sidebar',\n 'sociable',\n 'sponsor',\n 'taboola',\n 'tools',\n];\n\n// A list of strings that can be considered LIKELY candidates when\n// extracting content from a resource. Essentially, the inverse of the\n// blacklist above - if something matches both blacklist and whitelist,\n// it is kept. This is useful, for example, if something has a className\n// of \"rss-content entry-content\". It matched 'rss', so it would normally\n// be removed, however, it's also the entry content, so it should be left\n// alone.\n//\n// These strings are joined together and then tested for existence using\n// re:test, so may contain simple, non-pipe style regular expression queries\n// if necessary.\nexport const UNLIKELY_CANDIDATES_WHITELIST = [\n 'and',\n 'article',\n 'body',\n 'blogindex',\n 'column',\n 'content',\n 'entry-content-asset',\n 'format', // misuse of form\n 'hfeed',\n 'hentry',\n 'hatom',\n 'main',\n 'page',\n 'posts',\n 'shadow',\n];\n\n// A list of tags which, if found inside, should cause a to NOT\n// be turned into a paragraph tag. Shallow div tags without these elements\n// should be turned into tags.\nexport const DIV_TO_P_BLOCK_TAGS = [\n 'a',\n 'blockquote',\n 'dl',\n 'div',\n 'img',\n 'p',\n 'pre',\n 'table',\n].join(',');\n\n// A list of tags that should be ignored when trying to find the top candidate\n// for a document.\nexport const NON_TOP_CANDIDATE_TAGS = [\n 'br',\n 'b',\n 'i',\n 'label',\n 'hr',\n 'area',\n 'base',\n 'basefont',\n 'input',\n 'img',\n 'link',\n 'meta',\n];\n\nexport const NON_TOP_CANDIDATE_TAGS_RE =\n new RegExp(`^(${NON_TOP_CANDIDATE_TAGS.join('|')})$`, 'i');\n\n// A list of selectors that specify, very clearly, either hNews or other\n// very content-specific style content, like Blogger templates.\n// More examples here: http://microformats.org/wiki/blog-post-formats\nexport const HNEWS_CONTENT_SELECTORS = [\n ['.hentry', '.entry-content'],\n ['entry', '.entry-content'],\n ['.entry', '.entry_content'],\n ['.post', '.postbody'],\n ['.post', '.post_body'],\n ['.post', '.post-body'],\n];\n\nexport const PHOTO_HINTS = [\n 'figure',\n 'photo',\n 'image',\n 'caption',\n];\nexport const PHOTO_HINTS_RE = new RegExp(PHOTO_HINTS.join('|'), 'i');\n\n\n// A list of strings that denote a positive scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const POSITIVE_SCORE_HINTS = [\n 'article',\n 'articlecontent',\n 'instapaper_body',\n 'blog',\n 'body',\n 'content',\n 'entry-content-asset',\n 'entry',\n 'hentry',\n 'main',\n 'Normal',\n 'page',\n 'pagination',\n 'permalink',\n 'post',\n 'story',\n 'text',\n '[-_]copy', // usatoday\n '\\\\Bcopy',\n];\n\n// The above list, joined into a matching regular expression\nexport const POSITIVE_SCORE_RE = new RegExp(POSITIVE_SCORE_HINTS.join('|'), 'i');\n\n// Readability publisher-specific guidelines\nexport const READABILITY_ASSET = new RegExp('entry-content-asset', 'i');\n\n// A list of strings that denote a negative scoring for this content as being\n// an article container. Checked against className and id.\n//\n// TODO: Perhaps have these scale based on their odds of being quality?\nexport const NEGATIVE_SCORE_HINTS = [\n 'adbox',\n 'advert',\n 'author',\n 'bio',\n 'bookmark',\n 'bottom',\n 'byline',\n 'clear',\n 'com-',\n 'combx',\n 'comment',\n 'comment\\\\B',\n 'contact',\n 'copy',\n 'credit',\n 'crumb',\n 'date',\n 'deck',\n 'excerpt',\n 'featured', // tnr.com has a featured_content which throws us off\n 'foot',\n 'footer',\n 'footnote',\n 'graf',\n 'head',\n 'info',\n 'infotext', // newscientist.com copyright\n 'instapaper_ignore',\n 'jump',\n 'linebreak',\n 'link',\n 'masthead',\n 'media',\n 'meta',\n 'modal',\n 'outbrain', // slate.com junk\n 'promo',\n 'pr_', // autoblog - press release\n 'related',\n 'respond',\n 'roundcontent', // lifehacker restricted content warning\n 'scroll',\n 'secondary',\n 'share',\n 'shopping',\n 'shoutbox',\n 'side',\n 'sidebar',\n 'sponsor',\n 'stamp',\n 'sub',\n 'summary',\n 'tags',\n 'tools',\n 'widget',\n];\n// The above list, joined into a matching regular expression\nexport const NEGATIVE_SCORE_RE = new RegExp(NEGATIVE_SCORE_HINTS.join('|'), 'i');\n\n// XPath to try to determine if a page is wordpress. Not always successful.\nexport const IS_WP_SELECTOR = 'meta[name=generator][value^=WordPress]';\n\n// Match a digit. Pretty clear.\nexport const DIGIT_RE = new RegExp('[0-9]');\n\n// A list of words that, if found in link text or URLs, likely mean that\n// this link is not a next page link.\nexport const EXTRANEOUS_LINK_HINTS = [\n 'print',\n 'archive',\n 'comment',\n 'discuss',\n 'e-mail',\n 'email',\n 'share',\n 'reply',\n 'all',\n 'login',\n 'sign',\n 'single',\n 'adx',\n 'entry-unrelated',\n];\nexport const EXTRANEOUS_LINK_HINTS_RE = new RegExp(EXTRANEOUS_LINK_HINTS.join('|'), 'i');\n\n// Match any phrase that looks like it could be page, or paging, or pagination\nexport const PAGE_RE = new RegExp('pag(e|ing|inat)', 'i');\n\n// Match any link text/classname/id that looks like it could mean the next\n// page. Things like: next, continue, >, >>, » but not >|, »| as those can\n// mean last page.\n// export const NEXT_LINK_TEXT_RE = new RegExp('(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))', 'i');\nexport const NEXT_LINK_TEXT_RE = /(next|weiter|continue|>([^\\|]|$)|»([^\\|]|$))/i;\n\n// Match any link text/classname/id that looks like it is an end link: things\n// like \"first\", \"last\", \"end\", etc.\nexport const CAP_LINK_TEXT_RE = new RegExp('(first|last|end)', 'i');\n\n// Match any link text/classname/id that looks like it means the previous\n// page.\nexport const PREV_LINK_TEXT_RE = new RegExp('(prev|earl|old|new|<|«)', 'i');\n\n// Match 2 or more consecutive