diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index 1e24c821..6c23480a 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -1,23 +1,14 @@ -- **Platform**: -- **Mercury Parser Version**: -- **Node Version (if a Node bug)**: -- **Browser Version (if a browser bug)**: - ## Expected Behavior @@ -32,8 +23,6 @@ problem, keeping it as simple and free of external dependencies as you are able. -1. 2. 3. 4. - ## Detailed Description @@ -45,4 +34,4 @@ problem, keeping it as simple and free of external dependencies as you are able. ## Possible Solution - + diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 13c7e8d7..f7d99fd3 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -4,5 +4,5 @@ Thanks for submitting a pull request! In order to get this change merged in a timely manner, please provide a short description, review requirements, and a link to the issue this addresses (if applicable). -Contributing Guide: https://github.com/postlight/mercury-parser/blob/master/CONTRIBUTING.md +Contributing Guide: https://github.com/postlight/parser/blob/master/CONTRIBUTING.md --> diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index 91bfc773..5083cbc7 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,8 +1,8 @@ -# The Mercury Code of Conduct +# The Postlight Parser Code of Conduct ## Conduct -**Contact Mercury moderators:** [mercury@postlight.com](mailto:mercury@postlight.com) +**Contact Postlight Parser moderators:** [mercury@postlight.com](mailto:mercury@postlight.com) - We are committed to providing a friendly, safe and welcoming environment for all, regardless of level of experience, gender, gender identity and expression, @@ -31,7 +31,7 @@ - Private harassment is also unacceptable. 
No matter who you are, if you feel you have been or are being harassed or made uncomfortable by a community member, - please contact one of the channel ops or any of the [Mercury moderation team](mailto:mercury@postlight.com) + please contact one of the channel ops or any of the [Postlight Parser moderation team](mailto:mercury@postlight.com) immediately. Whether you're a regular contributor or a newcomer, we care about making this community a safe place for you and we've got your back. @@ -41,9 +41,9 @@ ## Moderation These are the policies for upholding our community's standards of conduct. If you -feel that a thread needs moderation, please contact the [Mercury moderation team](mailto:mercury@postlight.com). +feel that a thread needs moderation, please contact the [Postlight Parser moderation team](mailto:mercury@postlight.com). -1. Remarks that violate the Mercury standards of conduct, including hateful, hurtful, +1. Remarks that violate the Postlight Parser standards of conduct, including hateful, hurtful, oppressive, or exclusionary remarks, are not allowed. (Cursing is allowed, but never targeting another user, and never in a hateful manner.) @@ -69,7 +69,7 @@ feel that a thread needs moderation, please contact the [Mercury moderation team moderator creates an inappropriate situation, they should expect less leeway than others. -In the Mercury community we strive to go the extra step to look out for each other. +In the Postlight Parser community we strive to go the extra step to look out for each other. Don't just aim to be technically unimpeachable, try to be your best self. In particular, avoid flirting with offensive or sensitive issues, particularly if they're off-topic; this all too often leads to unnecessary fights, hurt feelings, @@ -84,9 +84,9 @@ are all here first and foremost because we want to talk about cool technology. You will find that people will be eager to assume good intent and forgive as long as you earn their trust. 
-The enforcement policies listed above apply to all official Mercury venues; including GitHub -repositories under postlight/mercury such as mercury-rs and other Postlight repositories -with a \*-mercury or mercury-\* naming convention; For other projects adopting the Mercury +The enforcement policies listed above apply to all official Postlight Parser venues, including GitHub +repositories under postlight/parser such as mercury-rs and other Postlight repositories +with a \*-parser or parser-\* naming convention. For other projects adopting the Postlight Parser Code of Conduct, please contact the maintainers of those projects for enforcement. If you wish to use this code of conduct for your own project, consider explicitly mentioning your moderation policy or making a copy with your own moderation policy diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a797407b..f3422693 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,12 +1,12 @@ -# Contributing to Mercury Parser +# Contributing to Postlight Parser -Thank you for your interest in contributing to Mercury Parser! It's people like you that make Mercury such a useful tool. The below guidelines will help answer any questions you may have about the contribution process. We look forward to receiving contributions from you — our community! +Thank you for your interest in contributing to Postlight Parser! It's people like you that make this such a useful tool. The below guidelines will help answer any questions you may have about the contribution process. We look forward to receiving contributions from you — our community! 
_Please read our [Code of Conduct](./CODE_OF_CONDUCT.md) before participating._ ## Contents -- [Contributing to Mercury Parser](#contributing-to-mercury-parser) +- [Contributing to Postlight Parser](#contributing-to-postlight-parser) - [Contents](#contents) - [Ways to Contribute](#ways-to-contribute) - [Reporting a Bug](#reporting-a-bug) @@ -27,15 +27,15 @@ _Please read our [Code of Conduct](./CODE_OF_CONDUCT.md) before participating._ ## Ways to Contribute -There are many ways you can contribute to the Mercury community. We value each type +There are many ways you can contribute to the Postlight Parser community. We value each type of contribution and appreciate your help. Here are a few examples of what we consider a contribution: - Updates to source code, including bug fixes, improvements, or [creating new custom site extractors](./src/extractors/custom/README.md) - Answering questions and chatting with the community in the [Gitter](https://gitter.im/postlight/mercury) room -- Filing, organizing, and commenting on issues in the [issue tracker](https://github.com/postlight/mercury-parser/issues) -- Teaching others how to use Mercury +- Filing, organizing, and commenting on issues in the [issue tracker](https://github.com/postlight/parser/issues) +- Teaching others how to use Postlight Parser - Community building and outreach ## Reporting a Bug @@ -49,41 +49,41 @@ as it's possible that someone else has already reported the error. This doesn't always work, and sometimes it's hard to know what to search for, so consider this extra credit. We won't mind if you accidentally file a duplicate report. -Opening an issue is as easy as following [this link](https://github.com/postlight/mercury-parser/issues/new) +Opening an issue is as easy as following [this link](https://github.com/postlight/parser/issues/new) and filling out the template. 
### Security -If you find a security bug in Mercury, send an email with a descriptive subject line +If you find a security bug in Postlight Parser, send an email with a descriptive subject line to [mercury+security@postlight.com](mailto:mercury+security@postlight.com). If you think -you’ve found a serious vulnerability, please do not file a public issue or share in the Mercury Gitter room. +you’ve found a serious vulnerability, please do not file a public issue or share in the Postlight Parser Gitter room. -Your report will go to Mercury's core development team. You will receive +Your report will go to Postlight Parser's core development team. You will receive acknowledgement of the report in 24-48 hours, and our next steps should be to release a fix. If you don’t get a report acknowledgement in 48 hours, send an email to [mercury@postlight.com](mailto:mercury@postlight.com). A working list of public, known security-related issues can be found in the -[issue tracker](https://github.com/postlight/mercury-parser/issues?q=is%3Aopen+is%3Aissue+label%3Asecurity). +[issue tracker](https://github.com/postlight/parser/issues?q=is%3Aopen+is%3Aissue+label%3Asecurity). ## Requesting a Feature -To request a change to the way that Mercury works, please open an issue in this repository named, "Feature Request: [Your Feature Idea]," followed by your suggestion. +To request a change to the way that Postlight Parser works, please open an issue in this repository named, "Feature Request: [Your Feature Idea]," followed by your suggestion. ## Development Workflow -This section of the document outlines how to build, run, and test Mercury locally. +This section of the document outlines how to build, run, and test Postlight Parser locally. ### Building -To build the Mercury Parser locally, execute the following commands: +To build the Postlight Parser locally, execute the following commands: ```bash # Clone this repository from GitHub. 
-git clone https://github.com/postlight/mercury-parser.git +git clone https://github.com/postlight/parser.git # Navigate into the root of this repository. -cd mercury-parser +cd parser # Install local dependencies. yarn install @@ -97,7 +97,7 @@ yarn build:web ### Testing -Mercury is a test-driven application; each component has its own test file. Tests are run for both node and web builds. Our testing frameworks are: +Postlight Parser is a test-driven application; each component has its own test file. Tests are run for both node and web builds. Our testing frameworks are: - `Jest` for the node build - `Karma` for the web build @@ -143,9 +143,9 @@ preset. This helps keep our Markdown tidy and consistent. ### Node.js Version Requirements -Mercury is built against Node `>= v12.8.1`. Since this is the +Postlight Parser is built against Node `>= v12.8.1`. Since this is the version we run in our CI environments, we recommend you use it when working on -the Mercury codebase. +the codebase. If you use [nvm](https://github.com/creationix/nvm) to manage Node.js versions and zsh (like [Oh-My-ZSH](https://github.com/robbyrussell/oh-my-zsh)), you can @@ -176,12 +176,12 @@ load-nvmrc ## Writing Documentation -Improvements to documentation are a great way to start contributing to Mercury. The +Improvements to documentation are a great way to start contributing to Postlight Parser. The source for the official documentation are Markdown files that live in this repository. ## Submitting a Pull Request -Want to make a change to Mercury? Submit a pull request! We use the "fork and pull" +Want to make a change to Postlight Parser? Submit a pull request! We use the "fork and pull" model [described here](https://help.github.com/articles/creating-a-pull-request-from-a-fork). **Before submitting a pull request**, please make sure: @@ -203,7 +203,7 @@ Commit messages should follow the format outlined below: | chore | does not effect the production version of the app in any way. 
| | deps | add, update, or remove a dependency. | | doc | add, update, or remove documentation. no code changes. | -| dx | improve the development experience of mercury core. | +| dx | improve the development experience of parser core. | | feat | a feature or enhancement. can be incredibly small. | | fix | a bug fix for something that was broken. | | perf | add, update, or fix a test. | @@ -222,9 +222,9 @@ fall behind. Feel free to reach out to the core team if you have not received a Some useful places to look for information are: - The main [README](./README.md) for this repository. -- The Mercury Custom Parser [README](./src/extractors/custom/README.md). +- The Postlight Custom Parser [README](./src/extractors/custom/README.md). - The postlight/mercury room on [Gitter](https://gitter.im/postlight/mercury) -- The Mercury Parser API [repository](https://github.com/postlight/mercury-parser-api). +- The Postlight Parser API [repository](https://github.com/postlight/parser-api). _Adapted from [Contributing to Node.js](https://github.com/nodejs/node/blob/master/CONTRIBUTING.md) and [ThinkUp Security and Data Privacy](http://thinkup.readthedocs.io/en/latest/install/security.html#thinkup-security-and-data-privacy)._ diff --git a/README.md b/README.md index 379633da..03b57fa9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -![Mercury Parser](https://13c27d41k2ud2vkddp226w55-wpengine.netdna-ssl.com/wp-content/uploads/2018/02/7bacd-16qwcaegges3hkrw70doz4w.png) +![Postlight Parser](https://13c27d41k2ud2vkddp226w55-wpengine.netdna-ssl.com/wp-content/uploads/2018/02/7bacd-16qwcaegges3hkrw70doz4w.png) -# Mercury Parser - Extracting content from chaos +# Postlight Parser - Extracting content from chaos [![CircleCI](https://circleci.com/gh/postlight/mercury-parser.svg?style=svg&circle-token=3026c2b527d3767750e767872d08991aeb4f8f10)](https://circleci.com/gh/postlight/mercury-parser) [![Greenkeeper 
badge](https://badges.greenkeeper.io/postlight/mercury-parser.svg)](https://greenkeeper.io/) [![Apache License][license-apach-badge]][license-apach] [![MITC License][license-mit-badge]][license-mit] [![Gitter chat](https://badges.gitter.im/postlight/mercury.png)](https://gitter.im/postlight/mercury) @@ -10,11 +10,11 @@ [license-mit-badge]: https://img.shields.io/badge/License-MIT%202.0-blue.svg?style=flat-square [license-mit]: https://github.com/postlight/mercury-parser/blob/master/LICENSE-MIT -[Postlight](https://postlight.com)'s Mercury Parser extracts the bits that humans care about from any URL you give it. That includes article content, titles, authors, published dates, excerpts, lead images, and more. +[Postlight](https://postlight.com)'s Parser extracts the bits that humans care about from any URL you give it. That includes article content, titles, authors, published dates, excerpts, lead images, and more. -Mercury Parser powers the [Mercury AMP Converter](https://mercury.postlight.com/amp-converter/) and [Mercury Reader](https://mercury.postlight.com/reader/), a Chrome extension that removes ads and distractions, leaving only text and images for a beautiful reading view on any site. +Postlight Parser powers [Postlight Reader](https://reader.postlight.com/), a browser extension that removes ads and distractions, leaving only text and images for a beautiful reading view on any site. -Mercury Parser allows you to easily create custom parsers using simple JavaScript and CSS selectors. This allows you to proactively manage parsing and migration edge cases. There are [many examples available](https://github.com/postlight/mercury-parser/tree/master/src/extractors/custom) along with [documentation](https://github.com/postlight/mercury-parser/blob/master/src/extractors/custom/README.md). +Postlight Parser allows you to easily create custom parsers using simple JavaScript and CSS selectors. This allows you to proactively manage parsing and migration edge cases. 
There are [many examples available](https://github.com/postlight/parser/tree/master/src/extractors/custom) along with [documentation](https://github.com/postlight/parser/blob/master/src/extractors/custom/README.md). ## How? Like this. @@ -22,21 +22,21 @@ Mercury Parser allows you to easily create custom parsers using simple JavaScrip ```bash # If you're using yarn -yarn add @postlight/mercury-parser +yarn add @postlight/parser # If you're using npm -npm install @postlight/mercury-parser +npm install @postlight/parser ``` ### Usage ```javascript -import Mercury from '@postlight/mercury-parser'; +import Parser from '@postlight/parser'; -Mercury.parse(url).then(result => console.log(result)); +Parser.parse(url).then(result => console.log(result)); // NOTE: When used in the browser, you can omit the URL argument -// and simply run `Mercury.parse()` to parse the current page. +// and simply run `Parser.parse()` to parse the current page. ``` The result looks like this: @@ -60,16 +60,16 @@ The result looks like this: } ``` -If Mercury is unable to find a field, that field will return `null`. +If Parser is unable to find a field, that field will return `null`. #### `parse()` Options ##### Content Formats -By default, Mercury Parser returns the `content` field as HTML. However, you can override this behavior by passing in options to the `parse` function, specifying whether or not to scrape all pages of an article, and what type of output to return (valid values are `'html'`, `'markdown'`, and `'text'`). For example: +By default, Postlight Parser returns the `content` field as HTML. However, you can override this behavior by passing in options to the `parse` function, specifying whether or not to scrape all pages of an article, and what type of output to return (valid values are `'html'`, `'markdown'`, and `'text'`). 
For example: ```javascript -Mercury.parse(url, { contentType: 'markdown' }).then(result => +Parser.parse(url, { contentType: 'markdown' }).then(result => console.log(result) ); ``` @@ -85,7 +85,7 @@ This returns the the page's `content` as GitHub-flavored Markdown: You can include custom headers in requests by passing name-value pairs to the `parse` function as follows: ```javascript -Mercury.parse(url, { +Parser.parse(url, { headers: { Cookie: 'name=value; name2=value2; name3=value3', 'User-Agent': @@ -96,10 +96,10 @@ Mercury.parse(url, { ##### Pre-fetched HTML -You can use Mercury Parser to parse custom or pre-fetched HTML by passing an HTML string to the `parse` function as follows: +You can use Postlight Parser to parse custom or pre-fetched HTML by passing an HTML string to the `parse` function as follows: ```javascript -Mercury.parse(url, { +Parser.parse(url, { html: '

Thunder (mascot)

Thunder is the stage name for the horse who is the official live animal mascot for the Denver Broncos

', }).then(result => console.log(result)); @@ -109,37 +109,36 @@ Note that the URL argument is still supplied, in order to identify the web site #### The command-line parser -Mercury Parser also ships with a CLI, meaning you can use the Mercury Parser -from your command line like so: +Postlight Parser also ships with a CLI, meaning you can use it from your command line like so: -![Mercury Parser CLI Basic Usage](./assets/mercury-basic-usage.gif) +![Postlight Parser CLI Basic Usage](./assets/parser-basic-usage.gif) ```bash -# Install Mercury globally -yarn global add @postlight/mercury-parser +# Install Postlight Parser globally +yarn global add @postlight/parser # or -npm -g install @postlight/mercury-parser +npm -g install @postlight/parser # Then -mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source +postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source # Pass optional --format argument to set content type (html|markdown|text) -mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --format=markdown +postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --format=markdown # Pass optional --header.name=value arguments to include custom headers in the request -mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --header.Cookie="name=value; name2=value2; name3=value3" --header.User-Agent="Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1" +postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --header.Cookie="name=value; name2=value2; name3=value3" --header.User-Agent="Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1" # Pass optional --extend argument to add a custom type to the response -mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source 
--extend credit="p:last-child em" +postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend credit="p:last-child em" # Pass optional --extend-list argument to add a custom type with multiple matches -mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list categories=".meta__tags-list a" +postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list categories=".meta__tags-list a" # Get the value of attributes by adding a pipe to --extend or --extend-list -mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list links=".body a|href" +postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --extend-list links=".body a|href" # Pass optional --add-extractor argument to add a custom extractor at runtime. -mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js +postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js ``` ## License @@ -153,7 +152,7 @@ Licensed under either of the below, at your preference: ## Contributing -For details on how to contribute to Mercury, including how to write a custom content extractor for any site, see [CONTRIBUTING.md](./CONTRIBUTING.md) +For details on how to contribute to Postlight Parser, including how to write a custom content extractor for any site, see [CONTRIBUTING.md](./CONTRIBUTING.md) Unless it is explicitly stated otherwise, any contribution intentionally submitted for inclusion in the work, as defined in the Apache-2.0 license, shall be dual licensed as above without any additional terms or conditions. diff --git a/RELEASE.md b/RELEASE.md index b5220e7c..e263a515 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,22 +1,26 @@ # How to cut a new release 1. Create a release branch. 
- ```bash - git checkout -b release-1.x.x # (where 1.x.x reflects the release) - ``` + +```bash +git checkout -b release-1.x.x # (where 1.x.x reflects the release) +``` + 2. Update package.json with the version number 3. Build the release - ```bash - yarn release - ``` + +```bash +yarn release +``` + 4. Update the changelog - ```bash - # Copy the output of the command below and paste it into CHANGELOG.md - # following the conventions of that file - yarn changelog-maker postlight mercury-parser - ``` +```bash +# Copy the output of the command below and paste it into CHANGELOG.md +# following the conventions of that file +yarn changelog-maker postlight parser +``` + 5. Submit a PR 6. Merge once the PR's tests pass -7. [Create a release](https://github.com/postlight/mercury-parser/releases), linking to this release's entry in the changelog. (See other releases for context.) - +7. [Create a release](https://github.com/postlight/parser/releases), linking to this release's entry in the changelog. (See other releases for context.) diff --git a/assets/mercury-basic-usage.gif b/assets/parser-basic-usage.gif similarity index 100% rename from assets/mercury-basic-usage.gif rename to assets/parser-basic-usage.gif diff --git a/cli.js b/cli.js index 10f5adec..d0787e0c 100755 --- a/cli.js +++ b/cli.js @@ -1,7 +1,7 @@ #!/usr/bin/env node /* eslint-disable */ -const Mercury = require('./dist/mercury'); +const Parser = require('./dist/mercury'); const package_info = require('./package.json'); const argv = require('yargs-parser')(process.argv.slice(2)); @@ -36,11 +36,11 @@ const { if (!urlToParse) { console.log( '\n\ -mercury-parser\n\n\ - The Mercury Parser extracts semantic content from any url\n\n\ +postlight-parser\n\n\ + The Postlight Parser extracts semantic content from any url\n\n\ Usage:\n\ \n\ - $ mercury-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... 
[--add-extractor path_to_extractor.js]... \n\ + $ postlight-parser url-to-parse [--format=html|text|markdown] [--header.name=value]... [--extend type=selector]... [--extend-list type=selector]... [--add-extractor path_to_extractor.js]... \n\ \n\ ' ); @@ -78,7 +78,7 @@ Usage:\n\ customExtractor = require(addExtractor); } - const result = await Mercury.parse(urlToParse, { + const result = await Parser.parse(urlToParse, { contentType: contentTypeMap[contentType], extend: extensions, headers, @@ -88,16 +88,16 @@ Usage:\n\ } catch (e) { if (e.message === 'ETIMEDOUT' && false) { console.error( - '\nMercury Parser encountered a timeout trying to load that resource.' + '\nPostlight Parser encountered a timeout trying to load that resource.' ); } else { console.error( - '\nMercury Parser encountered a problem trying to parse that resource.\n' + '\nPostlight Parser encountered a problem trying to parse that resource.\n' ); console.error(e); } const reportBug = - 'If you believe this was an error, please file an issue at:\n\n https://github.com/postlight/mercury-parser/issues/new'; + 'If you believe this was an error, please file an issue at:\n\n https://github.com/postlight/parser/issues/new'; console.error(`\n${reportBug}\n`); process.exit(1); } diff --git a/package.json b/package.json index 400c115b..5bb93491 100644 --- a/package.json +++ b/package.json @@ -1,16 +1,16 @@ { - "name": "@postlight/mercury-parser", + "name": "@postlight/parser", "version": "2.2.1", - "description": "Mercury transforms web pages into clean text. Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.", + "description": "Postlight Parser transforms web pages into clean text. 
Publishers and programmers use it to make the web make sense, and readers use it to read any web article comfortably.", "author": "Postlight ", - "homepage": "https://mercury.postlight.com", + "homepage": "https://reader.postlight.com", "license": "MIT", "repository": { "type": "git", - "url": "git+https://github.com/postlight/mercury-parser.git" + "url": "git+https://github.com/postlight/parser.git" }, "bugs": { - "url": "https://github.com/postlight/mercury-parser/issues" + "url": "https://github.com/postlight/parser/issues" }, "keywords": [ "mercury", @@ -26,7 +26,8 @@ ], "main": "./dist/mercury.js", "bin": { - "mercury-parser": "./cli.js" + "mercury-parser": "./cli.js", + "postlight-parser": "./cli.js" }, "scripts": { "lint": "eslint . --fix", diff --git a/scripts/comment-for-pr.js b/scripts/comment-for-pr.js index 45cb6267..f8bfdb9f 100644 --- a/scripts/comment-for-pr.js +++ b/scripts/comment-for-pr.js @@ -1,6 +1,6 @@ /* eslint-disable */ const bot = require('@jesses/circle-github-bot').default.create(); -const Mercury = require('../dist/mercury.js'); +const Parser = require('../dist/mercury.js'); const fs = require('fs'); const execSync = require('child_process').execSync; const { getReport } = require('@postlight/ci-failed-test-reporter'); @@ -13,7 +13,7 @@ const run = () => { const html = fs.readFileSync(`${fixture}`); // first parse is just to get the url - Mercury.parse('http://example.com', { html, fallback: false }).then( + Parser.parse('http://example.com', { html, fallback: false }).then( ({ url, domain, excerpt, word_count, direction }) => { // with the url, second pass will test the correct parser Mercury.parse(url, { html, fallback: false }).then(json => { diff --git a/scripts/generate-custom-parser.js b/scripts/generate-custom-parser.js index bbb955fb..be5e4c8a 100755 --- a/scripts/generate-custom-parser.js +++ b/scripts/generate-custom-parser.js @@ -8,7 +8,7 @@ import ora from 'ora'; import { exec } from 'child_process'; import { stripJunkTags, 
makeLinksAbsolute } from 'utils/dom'; -import Mercury from '../dist/mercury'; +import Parser from '../dist/mercury'; import extractorTemplate from './templates/custom-extractor'; import extractorTestTemplate from './templates/custom-extractor-test'; @@ -64,7 +64,7 @@ function scaffoldCustomParser(url) { confirmCreateDir(`./fixtures/${hostname}`, 'Creating fixtures directory'); } - confirm(Mercury.fetchResource, [url], 'Fetching fixture', newParser); + confirm(Parser.fetchResource, [url], 'Fetching fixture', newParser); } // if has arg, just assume that arg is a url and skip prmopt @@ -114,7 +114,7 @@ function savePage($, [url], newParser) { fs.writeFileSync(file, html); - Mercury.parse(url, { html }).then(result => { + Parser.parse(url, { html }).then(result => { if (newParser) { confirm( generateScaffold, diff --git a/scripts/templates/custom-extractor-test.js b/scripts/templates/custom-extractor-test.js index cbb2862d..10351ffe 100644 --- a/scripts/templates/custom-extractor-test.js +++ b/scripts/templates/custom-extractor-test.js @@ -34,7 +34,7 @@ export default function(file, url, dir, result, name) { import URL from 'url'; import cheerio from 'cheerio'; - import Mercury from 'mercury'; + import Parser from 'mercury'; import getExtractor from 'extractors/get-extractor'; import { excerptContent } from 'utils/text'; @@ -50,7 +50,7 @@ export default function(file, url, dir, result, name) { const html = fs.readFileSync('${file}'); result = - Mercury.parse(url, { html, fallback: false }); + Parser.parse(url, { html, fallback: false }); }); it('is selected properly', () => { diff --git a/scripts/update-fixtures.js b/scripts/update-fixtures.js index ce33e00a..25641dd3 100644 --- a/scripts/update-fixtures.js +++ b/scripts/update-fixtures.js @@ -6,7 +6,7 @@ const path = require('path'); const URL = require('url'); const octokit = require('@octokit/rest')(); -const Mercury = require('../dist/mercury'); +const Parser = require('../dist/mercury'); // get all fixtures 
execFile('find', ['fixtures', '-type', 'f'], (err, stdout) => { @@ -40,7 +40,7 @@ execFile('find', ['fixtures', '-type', 'f'], (err, stdout) => { Promise.all( fixturesToUpdate.map((fixture, i) => { const html = fs.readFileSync(fixture); - return Mercury.parse(`http://${baseDomains[i]}`, { html }); + return Parser.parse(`http://${baseDomains[i]}`, { html }); }) ).then(parsedFixture => { const fixturesAndUrls = fixturesToUpdate.reduce( @@ -76,7 +76,7 @@ const changeBase = []; const otherMess = []; const updateFixture = ({ fixture, url, baseDomain }) => { return new Promise(res => { - Mercury.parse(url) + Parser.parse(url) .then(({ url: updatedUrl }) => { if (!updatedUrl) { otherMess.push({ updatedUrl, url, fixture, baseDomain }); @@ -162,9 +162,7 @@ const createAndPushBranch = ({ branchName, commitMessage }) => { execFileSync('git', [ 'push', '-q', - `https://${ - process.env.GH_AUTH_TOKEN - }@github.com/postlight/mercury-parser.git`, + `https://${process.env.GH_AUTH_TOKEN}@github.com/postlight/parser.git`, ]); }; @@ -176,7 +174,7 @@ const createPR = ({ branchName, title, body = '' }) => { octokit.pulls.create({ owner: 'postlight', - repo: 'mercury-parser', + repo: 'parser', title, head: branchName, base: 'master', diff --git a/src/extractors/custom/README.md b/src/extractors/custom/README.md index 04ea013c..42867d14 100644 --- a/src/extractors/custom/README.md +++ b/src/extractors/custom/README.md @@ -1,12 +1,12 @@ # Custom Parsers -Mercury can extract meaningful content from almost any web site, but custom parsers/extractors allow the Mercury Parser to find the content more quickly and more accurately than it might otherwise do. Our goal is to include custom parsers as many sites as we can, and we'd love your help! +Postlight Parser can extract meaningful content from almost any web site, but custom parsers/extractors allow the Postlight Parser to find the content more quickly and more accurately than it might otherwise do. 
Our goal is to include custom parsers for as many sites as we can, and we'd love your help! ## The basics of parsing a site with a custom parser Custom parsers allow you to write CSS selectors that will find the content you're looking for on the page you're testing against. If you've written any CSS or jQuery, CSS selectors should be very familiar to you. -You can query for every field returned by the Mercury Parser: +You can query for every field returned by the Postlight Parser: - `title` - `author` @@ -39,11 +39,11 @@ export const ExampleExtractor = { ... ``` -As you might guess, the selectors key provides an array of selectors that Mercury will check to find your title text. In our `ExampleExtractor`, we're saying that the title can be found in the text of an `h1` header with a class name of `hed`. +As you might guess, the selectors key provides an array of selectors that Postlight Parser will check to find your title text. In our `ExampleExtractor`, we're saying that the title can be found in the text of an `h1` header with a class name of `hed`. -The selector you choose should return one element. If more than one element is returned by your selector, it will fail (and Mercury will fall back to its generic extractor). +The selector you choose should return one element. If more than one element is returned by your selector, it will fail (and Parser will fall back to its generic extractor). -Because the `selectors` property returns an array, you can write more than one selector for a property extractor. This is particularly useful for sites that have multiple templates for articles. If you provide an array of selectors, Mercury will try each in order, falling back to the next until it finds a match or exhausts the options (in which case it will fall back to its default generic extractor). +Because the `selectors` property returns an array, you can write more than one selector for a property extractor. 
This is particularly useful for sites that have multiple templates for articles. If you provide an array of selectors, Parser will try each in order, falling back to the next until it finds a match or exhausts the options (in which case it will fall back to its default generic extractor). #### Selecting an attribute @@ -71,7 +71,7 @@ export const ExampleExtractor = { ... ``` -This is all you'll need to know to handle most of the fields Mercury parses (titles, authors, date published, etc.). Article content is the exception. +This is all you'll need to know to handle most of the fields Parser parses (titles, authors, date published, etc.). Article content is the exception. #### Content selectors @@ -99,7 +99,7 @@ export const ExampleExtractor = { To add a custom key to the response, add an `extend` object. The response will include results for each key of this object (`categories` in the example below). Setting -`allowMultiple` to `true` means Mercury will find all the content that matches the +`allowMultiple` to `true` means Parser will find all the content that matches the selectors, and will always return an array of results for that key. ```javascript @@ -195,12 +195,12 @@ Now that you know the basics of how custom extractors work, let's walk through t ### Step 0: Installation -First, you'll need to clone the Mercury Parser repository and install dependencies. +First, you'll need to clone the Postlight Parser repository and install dependencies. 
```bash -git clone git@github.com:postlight/mercury-parser.git +git clone git@github.com:postlight/parser.git -cd mercury-parser +cd parser yarn install ``` @@ -255,7 +255,7 @@ it('returns the title', async () => { const articleUrl = 'http://www.newyorker.com/tech/elements/hacking-cryptography-and-the-countdown-to-quantum-computing'; - const { title } = await Mercury.parse(articleUrl, { html, fallback: false }); + const { title } = await Parser.parse(articleUrl, { html, fallback: false }); // Update these values with the expected values from // the article. @@ -265,7 +265,7 @@ it('returns the title', async () => { As you can see, to pass this test, we need to fill out our title selector. In order to do this, you need to know what your selector is. To do this, open the html fixture the generator downloaded for you in the [`fixtures`](/fixtures) directory. In our example, that file is `fixtures/www.newyorker.com/1475248565793.html`. Now open that file in your web browser. -The page should look more or less exactly like the site you pointed it to, but this version is downloaded locally for test purposes. (You should always look for selectors using this local fixture rather than the actual web site; some sites re-write elements after the page loads, and we want to make sure we're looking at the page the same way Mercury will be.) +The page should look more or less exactly like the site you pointed it to, but this version is downloaded locally for test purposes. (You should always look for selectors using this local fixture rather than the actual web site; some sites re-write elements after the page loads, and we want to make sure we're looking at the page the same way Postlight Parser will be.) (For the purpose of this guide, we're going to assume you're using Chrome as your default browser; any browser should do, but we're going to refer specifically to Chrome's developer tools in this guide.) 
@@ -302,7 +302,7 @@ AssertionError: 'Hacking, Cryptography, and the Countdown to Quantum Computing' 'Schrödinger’s Hack'; ``` -When Mercury generated our test, it took a guess at the page's title, and in this case, it got it wrong. So update the test with the title we expect, save it, and your test should pass! +When Parser generated our test, it took a guess at the page's title, and in this case, it got it wrong. So update the test with the title we expect, save it, and your test should pass! ### Step 3: Speed it up @@ -370,7 +370,7 @@ const customExtractor = { }, }; -Mercury.addExtractor(customExtractor); +Parser.addExtractor(customExtractor); ``` --- @@ -406,5 +406,5 @@ module.exports = customExtractor; ### 2. From the CLI, add the `--add-extractor` param: ```bash -mercury-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js +postlight-parser https://postlight.com/trackchanges/mercury-goes-open-source --add-extractor ./src/extractors/fixtures/postlight.com/index.js ``` diff --git a/src/mercury.js b/src/mercury.js index 8a3bbb5e..4a742d0c 100644 --- a/src/mercury.js +++ b/src/mercury.js @@ -9,7 +9,7 @@ import getExtractor from 'extractors/get-extractor'; import RootExtractor, { selectExtendedTypes } from 'extractors/root-extractor'; import collectAllPages from 'extractors/collect-all-pages'; -const Mercury = { +const Parser = { async parse(url, { html, ...opts } = {}) { const { fetchAllPages = true, @@ -53,7 +53,7 @@ const Mercury = { const Extractor = getExtractor(url, parsedUrl, $); // console.log(`Using extractor for ${Extractor.domain}`); - // if html still has not been set (i.e., url passed to Mercury.parse), + // if html still has not been set (i.e., url passed to Parser.parse), // set html from the response of Resource.create if (!html) { html = $.html(); @@ -125,4 +125,4 @@ const Mercury = { }, }; -export default Mercury; +export default Parser; diff --git a/src/mercury.test.js 
b/src/mercury.test.js index aac2b9fe..3d83130c 100644 --- a/src/mercury.test.js +++ b/src/mercury.test.js @@ -1,23 +1,23 @@ import assert from 'assert'; import { record } from 'test-helpers'; -import Mercury from './mercury'; +import Parser from './mercury'; const fs = require('fs'); -describe('Mercury', () => { +describe('Parser', () => { const recorder = record('mercury-test'); beforeAll(recorder.before); afterAll(recorder.after); describe('parse(url)', () => { it('returns an error if a malformed url is passed', async () => { - const error = await Mercury.parse('foo.com'); + const error = await Parser.parse('foo.com'); assert(/does not look like a valid URL/i.test(error.message)); }); it('does the whole thing', async () => { - const result = await Mercury.parse( + const result = await Parser.parse( 'http://deadspin.com/remember-when-donald-trump-got-booed-for-butchering-ta-1788216229' ); @@ -26,7 +26,7 @@ describe('Mercury', () => { }); it('returns an error on non-200 responses', async () => { - const error = await Mercury.parse( + const error = await Parser.parse( 'https://www.thekitchn.com/instant-pot-chicken-pesto-pasta-eating-instantly-267141' ); @@ -34,7 +34,7 @@ describe('Mercury', () => { }); it('returns an error on invalid content types', async () => { - const error = await Mercury.parse( + const error = await Parser.parse( 'https://upload.wikimedia.org/wikipedia/commons/5/52/Spacer.gif' ); @@ -42,7 +42,7 @@ describe('Mercury', () => { }); it('does wikipedia', async () => { - const result = await Mercury.parse( + const result = await Parser.parse( 'https://en.wikipedia.org/wiki/Brihadeeswarar_Temple_fire' ); @@ -51,7 +51,7 @@ describe('Mercury', () => { it('does washingtonpost', async () => { jasmine.DEFAULT_TIMEOUT_INTERVAL = 10000; - const result = await Mercury.parse( + const result = await Parser.parse( 'https://www.washingtonpost.com/news/opinions/wp/2018/10/29/enough-platitudes-lets-name-names/' ); @@ -64,7 +64,7 @@ describe('Mercury', () => { }); 
it('does the nyt', async () => { - const result = await Mercury.parse( + const result = await Parser.parse( 'http://www.nytimes.com/2016/08/16/upshot/the-state-of-the-clinton-trump-race-is-it-over.html?_r=0' ); @@ -76,7 +76,7 @@ describe('Mercury', () => { jasmine.DEFAULT_TIMEOUT_INTERVAL = 10000; const url = 'https://arstechnica.com/gadgets/2016/08/the-connected-renter-how-to-make-your-apartment-smarter/'; - const result = await Mercury.parse(url, { fetchAllPages: true }); + const result = await Parser.parse(url, { fetchAllPages: true }); const { total_pages, pages_rendered } = result; @@ -94,7 +94,7 @@ describe('Mercury', () => { './src/extractors/custom/nymag.com/fixtures/test.html', 'utf8' ); - const { content } = await Mercury.parse(url, { html, contentType: 'text' }); + const { content } = await Parser.parse(url, { html, contentType: 'text' }); const htmlRe = /<[a-z][\s\S]*>/g; @@ -108,7 +108,7 @@ describe('Mercury', () => { './src/extractors/custom/nymag.com/fixtures/test.html', 'utf8' ); - const { content } = await Mercury.parse(url, { + const { content } = await Parser.parse(url, { html, contentType: 'markdown', }); @@ -127,7 +127,7 @@ describe('Mercury', () => { './src/extractors/custom/nymag.com/fixtures/test.html', 'utf8' ); - const { sites } = await Mercury.parse(url, { + const { sites } = await Parser.parse(url, { html, extend: { sites: { @@ -148,7 +148,7 @@ describe('Mercury', () => { './src/extractors/custom/nymag.com/fixtures/test.html', 'utf8' ); - const { sites } = await Mercury.parse(url, { + const { sites } = await Parser.parse(url, { html, extend: { sites: { @@ -168,7 +168,7 @@ describe('Mercury', () => { './src/extractors/custom/nymag.com/fixtures/test.html', 'utf8' ); - const { sites } = await Mercury.parse(url, { + const { sites } = await Parser.parse(url, { html, extend: { sites: { @@ -208,9 +208,9 @@ describe('Mercury', () => { }, }; - Mercury.addExtractor(customExtractor); + Parser.addExtractor(customExtractor); - const result = await 
Mercury.parse(url, { html }); + const result = await Parser.parse(url, { html }); assert.equal(typeof result, 'object'); assert.equal(result.author, 'Jennifer Van Grove'); assert.equal(result.domain, 'www.sandiegouniontribune.com');