From 89b1c8ebf81d58dfa922120c808d3878b43a6888 Mon Sep 17 00:00:00 2001 From: gardenapple Date: Fri, 15 Oct 2021 19:05:57 +0300 Subject: [PATCH] Add man page, sourced from Markdown --- package.json | 9 +- readable.1 | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++ readable.1.md | 148 +++++++++++++++++++++++++++++++ 3 files changed, 393 insertions(+), 2 deletions(-) create mode 100644 readable.1 create mode 100644 readable.1.md diff --git a/package.json b/package.json index cd959f5..e778591 100644 --- a/package.json +++ b/package.json @@ -7,8 +7,10 @@ "readable": "./index.js" }, "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" + "test": "echo \"Error: no test specified\" && exit 1", + "man": "marked-man --version=\"$(node -p 'require(\"./package.json\").version')\" readable.1.md > readable.1" }, + "man": "./readable.1", "repository": "gitlab:gardenappl/readability-cli", "author": "gardenapple ", "bugs": "https://gitlab.com/gardenappl/readability-cli/-/issues", @@ -25,9 +27,12 @@ "license": "GPL-3.0-only", "dependencies": { "@mozilla/readability": "^0.4.1", - "dompurify": "^2.2.7", + "dompurify": "^2.3.3", "jsdom": "^16.5.1", "y18n": "^5.0.5", "yargs": "^17.0.0" + }, + "devDependencies": { + "marked-man": "^0.7.0" } } diff --git a/readable.1 b/readable.1 new file mode 100644 index 0000000..9e06dd2 --- /dev/null +++ b/readable.1 @@ -0,0 +1,238 @@ +.TH "READABILITY\-CLI" "1" "October 2021" "2.3.0" "" +.SH "NAME" +\fBreadability-cli\fR \- get useful text from a web page +.SH SYNOPSIS +.P +\fBreadable\fR \fI[SOURCE]\fR \fI[options]\.\.\.\fR +.SH DESCRIPTION +.P +\fBreadability\-cli\fR takes any HTML page and strips out unnecessary bloat, leaving only the core text content\. The resulting HTML may be suitable for terminal browsers, text readers, and other uses\. +.P +This package provides the \fBreadable\fR command, which uses Mozilla's Readability library\. The same library is used in Firefox's Reader View\. 
+.SH OPTIONS +.P +The \fISOURCE\fR can be a URL, a file, or '\-' for standard input\. +.P +\fB\-\-help\fP +.RS 0 +.IP \(bu 2 +Show help message, and exit\. + +.RE +.P +\fB\-b\fP, \fB\-\-base\fP \fIURL\fR +.RS 0 +.IP \(bu 2 +Specify the document's URL\. This affects relative links: they will not work if \fBreadability\-cli\fR does not know the base URL\. You only need this option if you read HTML from a local file, or from standard input\. + +.RE +.P +\fB\-i\fP, \fB\-\-insane\fP +.RS 0 +.IP \(bu 2 +Don't sanitize HTML\. + +.RE +.P +\fB\-K\fP, \fB\-\-insecure\fP +.RS 0 +.IP \(bu 2 +Allow invalid SSL certificates\. + +.RE +.P +\fB\-j\fP, \fB\-\-json\fP +.RS 0 +.IP \(bu 2 +Output all known properties of the document as JSON (see \fBProperties\fR subsection)\. + +.RE +.P +\fB\-l\fP, \fB\-\-low\-confidence\fP \fIMODE\fR +.RS 0 +.IP \(bu 2 +What to do if Readability is uncertain about what the core content actually is\. The possible modes are: +.RS +.IP \(bu 2 +\fBkeep\fR \- When unsure, don't touch the HTML, output as\-is\. +.IP \(bu 2 +\fBforce\fR \- Process the document even when unsure (may produce really bad output)\. +.IP \(bu 2 +\fBexit\fR \- When unsure, exit with an error\. + +.RE +.IP \(bu 2 +The default value is \fBkeep\fR\|\. If the \fB\-\-properties\fP or \fB\-\-json\fP options are set, the program will always run in \fBexit\fR mode\. + +.RE +.P +\fB\-C\fP, \fB\-\-keep\-classes\fP +.RS 0 +.IP \(bu 2 +Preserve CSS classes for input elements\. By default, CSS classes are stripped, and the input is adapted for Firefox's Reader View\. + +.RE +.P +\fB\-o\fP, \fB\-\-output\fP \fIFILE\fR +.RS 0 +.IP \(bu 2 +Output the result to FILE\. + +.RE +.P +\fB\-p\fP, \fB\-\-properties\fP \fIPROPERTIES\fR\|\.\.\. +.RS 0 +.IP \(bu 2 +Output specific properties of the document (see \fBProperties\fR subsection)\. + +.RE +.P +\fB\-x\fP, \fB\-\-proxy\fP \fIURL\fR +.RS 0 +.IP \(bu 2 +Use specified proxy (can also use \fBHTTPS_PROXY\fP environment variable)\. 
+ +.RE +.P +\fB\-q\fP, \fB\-\-quiet\fP +.RS 0 +.IP \(bu 2 +Don't print extra information\. + +.RE +.P +\fB\-s\fP, \fB\-\-style\fP +.RS 0 +.IP \(bu 2 +Specify \fI\|\.css\fR file for stylesheet\. + +.RE +.P +\fB\-A\fP, \fB\-\-user\-agent\fP \fISTRING\fR +.RS 0 +.IP \(bu 2 +Set custom user agent string\. + +.RE +.P +\fB\-V\fP, \fB\-\-version\fP +.RS 0 +.IP \(bu 2 +Print \fBreadability\-cli\fR and Node\.js version, then exit\. + +.RE +.P +\fB\-\-completion\fP +.RS 0 +.IP \(bu 2 +Print script for shell completion, and exit\. Provides Zsh completion if the current shell is zsh, otherwise provides Bash completion\. + +.RE +.SS Properties +.P +The \fB\-\-properties\fP option accepts a list of values, separated by spaces\. Suitable values are: +.RS 0 +.IP \(bu 2 +\fBtitle\fR \- The title of the article\. +.IP \(bu 2 +\fBhtml\-title\fR \- The title of the article, wrapped in an \fB<h1>\fP tag\. +.IP \(bu 2 +\fBexcerpt\fR \- Article description, or short excerpt from the content\. +.IP \(bu 2 +\fBbyline\fR \- Data about the page's author\. +.IP \(bu 2 +\fBlength\fR \- Length of the article in characters\. +.IP \(bu 2 +\fBdir\fR \- Text direction, is either "ltr" for left\-to\-right or "rtl" for right\-to\-left\. +.IP \(bu 2 +\fBtext\-content\fR \- Output the article's main content as plain text\. +.IP \(bu 2 +\fBhtml\-content\fR \- Output the article's main content as an HTML body\. + +.RE +.P +Properties are printed line by line, in the order specified by the user\. Only "text\-content" and "html\-content" are printed as multiple lines\. +.SH EXIT STATUS +.P +As usual, exit code 0 indicates success, and anything other than 0 is an error\. \fBreadability\-cli\fR uses standard\fB*\fP error codes: +.TS +tab(|) expand nowarn box; + l l. +T{ +Error code +T}|T{ +Meaning +T} +_ +T{ +\fB64\fR +T}|T{ +Bad CLI arguments +T} +T{ +\fB65\fR +T}|T{ +Data format error: can't parse document using Readability\. +T} +T{ +\fB66\fR +T}|T{ +No input +T} +T{ +\fB68\fR +T}|T{ +Unknown host name for URL +T} +T{ +\fB77\fR +T}|T{ +Permission denied: can't read file +T} +.TE +.P +\fB*\fP By "standard error codes" I mean "close to a standard"\. And by that I mean: I actually don't remember any command line tools which use this convention\. You may find more info in \fBsysexits\fR(3), or maybe just \fIsysexits\.h\fR\|\. +.SH ENVIRONMENT +.P +\fBreadability\-cli\fR supports localization, using the environment variables \fBLC_ALL\fP, \fBLC_MESSAGES\fP, \fBLANG\fP and \fBLANGUAGE\fP, in that order\. Only one language at a time is supported\. +.P +\fBHTTPS_PROXY\fP will set the HTTPS proxy, as previously stated, however the \fB\-\-proxy\fP option overrides this\. Lowercase \fBhttps_proxy\fP and \fBhttp_proxy\fP are also recognized\. 
+.SH EXAMPLE +.P +\fBRead HTML from a file and output the result to the console:\fR +.P +.RS 2 +.nf +readable index\.html +.fi +.RE +.P +\fBFetch a random Wikipedia article, get its title and an excerpt:\fR +.P +.RS 2 +.nf +readable https://en\.wikipedia\.org/wiki/Special:Random \-p title,excerpt +.fi +.RE +.P +\fBFetch a web page and read it in W3M:\fR +.P +.RS 2 +.nf +readable https://www\.nytimes\.com/2020/01/18/technology/clearview\-privacy\-facial\-recognition\.html | w3m \-T text/html +.fi +.RE +.P +\fBDownload a web page using cURL, parse it and output as JSON:\fR +.P +.RS 2 +.nf +curl https://github\.com/mozilla/readability | readable \-\-base=https://github\.com/mozilla/readability \-\-json +.fi +.RE +.SH SEE ALSO +.P +\fBcurl\fR(1), \fBw3m\fR(1), \fBsysexits\fR(3) +.P +Source code, license, bug tracker and merge requests may be found on GitLab \fIhttps://gitlab\.com/gardenappl/readability\-cli\fR\|\. + diff --git a/readable.1.md b/readable.1.md new file mode 100644 index 0000000..dcc61bd --- /dev/null +++ b/readable.1.md @@ -0,0 +1,148 @@ +# readability-cli(1) -- get useful text from a web page + +## SYNOPSIS + +**readable** *[SOURCE]* *[options]...* + +## DESCRIPTION + +**readability-cli** takes any HTML page and strips out unnecessary bloat, leaving only the core text content. The resulting HTML may be suitable for terminal browsers, text readers, and other uses. + +This package provides the **readable** command, which uses Mozilla's Readability library. The same library is used in Firefox's Reader View. + +## OPTIONS + +The *SOURCE* can be a URL, a file, or '-' for standard input. + +`--help` + +* Show help message, and exit. + +`-b`, `--base` *URL* + +* Specify the document's URL. This affects relative links: they will not work if **readability-cli** does not know the base URL. You only need this option if you read HTML from a local file, or from standard input. + +`-i`, `--insane` + +* Don't sanitize HTML. 
+ +`-K`, `--insecure` + +* Allow invalid SSL certificates. + +`-j`, `--json` + +* Output all known properties of the document as JSON (see **Properties** subsection). + +`-l`, `--low-confidence` *MODE* + +* What to do if Readability is uncertain about what the core content actually is. The possible modes are: + + * **keep** - When unsure, don't touch the HTML, output as-is. + * **force** - Process the document even when unsure (may produce really bad output). + * **exit** - When unsure, exit with an error. + +* The default value is **keep**. If the `--properties` or `--json` options are set, the program will always run in **exit** mode. + +`-C`, `--keep-classes` + +* Preserve CSS classes for input elements. By default, CSS classes are stripped, and the input is adapted for Firefox's Reader View. + +`-o`, `--output` *FILE* + +* Output the result to FILE. + +`-p`, `--properties` *PROPERTIES*... + +* Output specific properties of the document (see **Properties** subsection). + +`-x`, `--proxy` *URL* + +* Use specified proxy (can also use `HTTPS_PROXY` environment variable). + +`-q`, `--quiet` + +* Don't print extra information. + +`-s`, `--style` + +* Specify *.css* file for stylesheet. + +`-A`, `--user-agent` *STRING* + +* Set custom user agent string. + +`-V`, `--version` + +* Print **readability-cli** and Node.js version, then exit. + +`--completion` + +* Print script for shell completion, and exit. Provides Zsh completion if the current shell is zsh, otherwise provides Bash completion. + +### Properties + +The `--properties` option accepts a list of values, separated by spaces. Suitable values are: + +* **title** - The title of the article. +* **html-title** - The title of the article, wrapped in an `<h1>` tag. +* **excerpt** - Article description, or short excerpt from the content. +* **byline** - Data about the page's author. +* **length** - Length of the article in characters. +* **dir** - Text direction, is either "ltr" for left-to-right or "rtl" for right-to-left. +* **text-content** - Output the article's main content as plain text. +* **html-content** - Output the article's main content as an HTML body. + +Properties are printed line by line, in the order specified by the user. Only "text-content" and "html-content" are printed as multiple lines. + +## EXIT STATUS + +As usual, exit code 0 indicates success, and anything other than 0 is an error. **readability-cli** uses standard`*` error codes: + +| Error code | Meaning | +| --: | :-- | +| **64** | Bad CLI arguments | +| **65** | Data format error: can't parse document using Readability. | +| **66** | No input | +| **68** | Unknown host name for URL | +| **77** | Permission denied: can't read file | + +`*` By "standard error codes" I mean "close to a standard". And by that I mean: I actually don't remember any command line tools which use this convention. You may find more info in **sysexits**(3), or maybe just *sysexits.h*. + +## ENVIRONMENT + +**readability-cli** supports localization, using the environment variables `LC_ALL`, `LC_MESSAGES`, `LANG` and `LANGUAGE`, in that order. Only one language at a time is supported. + +`HTTPS_PROXY` will set the HTTPS proxy, as previously stated, however the `--proxy` option overrides this. Lowercase `https_proxy` and `http_proxy` are also recognized. 
+ +## EXAMPLE + +**Read HTML from a file and output the result to the console:** + +``` +readable index.html +``` + +**Fetch a random Wikipedia article, get its title and an excerpt:** + +``` +readable https://en.wikipedia.org/wiki/Special:Random -p title,excerpt +``` + +**Fetch a web page and read it in W3M:** + +``` +readable https://www.nytimes.com/2020/01/18/technology/clearview-privacy-facial-recognition.html | w3m -T text/html +``` + +**Download a web page using cURL, parse it and output as JSON:** + +``` +curl https://github.com/mozilla/readability | readable --base=https://github.com/mozilla/readability --json +``` + +## SEE ALSO + +**curl**(1), **w3m**(1), **sysexits**(3) + +Source code, license, bug tracker and merge requests may be found on [GitLab](https://gitlab.com/gardenappl/readability-cli).