You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
readability-cli/readable.ts

132 lines
3.6 KiB
TypeScript

#!/usr/bin/env -S deno run --allow-net --allow-read --allow-write --allow-env=HTTPS_PROXY,LC_ALL,LC_MESSAGES,LANG,LANGUAGE --no-prompt --no-check
const version = "2.3.5"
import * as path from "https://deno.land/std@0.130.0/path/mod.ts"
import { Buffer } from "https://deno.land/std@0.130.0/node/buffer.ts"
import fs from "https://deno.land/std@0.130.0/node/fs.ts"
import process from "https://deno.land/std@0.130.0/node/process.ts"
import yargs from "https://deno.land/x/yargs@v17.4.0-deno/deno.ts"
import y18n from "https://deno.land/x/y18n@v5.0.8-deno/deno.ts"
import { initParser, DOMParser, DOMParserMimeType, Document, Element } from "https://deno.land/x/deno_dom@v0.1.21-alpha/deno-dom-wasm-noinit.ts"
import * as ammonia from "https://deno.land/x/ammonia@0.3.1/mod.ts"
import { Readability, isProbablyReaderable } from "https://esm.sh/@mozilla/readability@0.4.1?no-check"
import UserAgent from "https://esm.sh/user-agents@1.0.963"
// GNU gettext gives preference to LANGUAGE above all else, but this order is consistent with Yargs:
const locale = (
Deno.env.get("LC_ALL") ||
Deno.env.get("LC_MESSAGES") ||
Deno.env.get("LANG") ||
Deno.env.get("LANGUAGE") ||
"en_US"
).replace(/[.:].*/, '')
const __ = y18n({
locale: locale,
updateFiles: false,
directory: path.join(path.dirname(path.fromFileUrl(import.meta.url)), "locales")
}).__
function printVersion() {
console.log(`readability-cli v${version}`)
console.log(`Deno ${Deno.version.deno}`)
}
async function parseDOMFromURL(url: string, _proxy: string, _strictSSL: boolean, userAgent: string) {
const initParserPromise = initParser()
const userAgentString = userAgent ?? new UserAgent({ deviceCategory: "desktop" }).toString()
const response = await fetch(url, {
headers: {
"User-Agent": userAgentString
}
})
if (!response.ok) {
throw {
statusCode: response.status,
response: {
statusMessage: response.statusText
}
}
}
const text = await response.text()
await initParserPromise
const contentType = response.headers.get("Content-Type")!
let mimeType = contentType.slice(0, contentType.indexOf(';'))
if (mimeType == "text/htm")
mimeType = "text/html"
return parseDOM(text, url, mimeType as DOMParserMimeType)
}
async function parseDOM(html: string, url?: string, mimeType?: DOMParserMimeType) {
await initParser()
const document = new DOMParser().parseFromString(html, mimeType ?? "text/html")!
const baseURLString = document.getElementsByTagName("base")[0]?.getAttribute("href") ?? url
if (baseURLString) {
const baseURL = new URL(baseURLString)
const nodes: Element[] = []
nodes.push(document.documentElement!)
while (nodes.length > 0) {
const element = nodes.pop()!
const href = element.getAttribute("href")
if (href) {
try {
// Try to parse absolute URL
new URL(href)
} catch {
// Assume href is a relative URL
element.setAttribute("href", new URL(href, baseURL))
}
}
nodes.push(...element.children)
}
}
return [document]
}
async function parseDOMFromFile(file: string, url: string) {
const data = await Deno.readFile(file)
return parseDOM(new TextDecoder().decode(data), url)
}
async function sanitizeHTML(html: string) {
await ammonia.init()
return ammonia.clean(html)
}
async function sanitizeDOM(document: Document) {
return await sanitizeHTML(document.documentElement!.outerHTML)
}
import readable from "./common.mjs"
await readable(
Buffer,
fs,
process,
yargs(Deno.args),
__,
Readability,
isProbablyReaderable,
printVersion,
parseDOM,
parseDOMFromFile,
parseDOMFromURL,
sanitizeDOM,
sanitizeHTML
)
if (process.exitCode) {
process.exit()
}