# meli/scripts/make_html_manual_page.py

# meli - scripts/make_html_manual_page.py
#
# Copyright 2023 Manos Pitsidianakis
#
# This file is part of meli.
#
# meli is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# meli is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with meli. If not, see <http://www.gnu.org/licenses/>.

import subprocess
from threading import Timer
from html.parser import HTMLParser
import argparse
import sys
import re
import shutil
import textwrap
from subprocess import PIPE
from urllib.parse import urlparse
from pathlib import Path
import http.client
from http import HTTPMethod, HTTPStatus
import signal
import functools
from bs4 import BeautifulSoup

# When true, clear_line(), draw_progress() and draw_spinner() return without
# writing anything to the terminal. Overwritten from the --no-tty command-line
# flag when the file is run as a script.
NO_TTY = False


def clear_line(signum, frame):
    """Blank the current terminal line and reset the progress bar's width.

    The ``(signum, frame)`` signature is the one required of a signal handler;
    neither argument is read, so the function may also be called directly with
    ``None`` for both. Does nothing when ``NO_TTY`` is set.
    """
    if NO_TTY:
        return
    columns = shutil.get_terminal_size().columns
    message = chr(27) + "[0G"  # go to start of line
    sys.stdout.write(message)
    # Overwrite the whole line with spaces, then return to its first column.
    message = " " * columns + "\r"
    sys.stdout.write(message)
    sys.stdout.flush()
    # Forget how wide the last progress message was so the next draw does not
    # pad itself to a width measured before the line was cleared.
    draw_progress.max_cols = 0


# SIGWINCH (terminal resized) only exists on Unix; registering it
# unconditionally raises AttributeError at import time elsewhere.
if hasattr(signal, "SIGWINCH"):
    signal.signal(signal.SIGWINCH, clear_line)

# URL templates of online manual page mirrors. When external references are
# resolved, the templates are tried in list order and the first one that
# answers successfully is used. "%N" is replaced with the page name and "%S"
# with its section.
TEMPLATES = [
    "http://linux.die.net/man/%S/%N",
    "http://man7.org/linux/man-pages/man%S/%N.%S.html",
    "http://manpages.debian.org/stable/%N.%S.en.html",
    "http://man.archlinux.org/man/%N.%S",
    "http://man.voidlinux.org/%N.%S",
    "http://man.bsd.lv/%N.%S",
    "http://man.bsd.lv/OpenBSD-7.0/%N.%S",
    "http://man.bsd.lv/FreeBSD-13.0/%N.%S",
    "http://man.bsd.lv/POSIX-2013/%N.%S",  # last resorts
    "http://man.bsd.lv/UNIX-7/%N.%S",
]


def add_progress(count=1):
    """Advance the shared progress counter by ``count`` (one step by default).

    The running total lives on the function object itself as
    ``add_progress.count``.
    """
    add_progress.count = add_progress.count + count


# Running total of completed work; a float because steps may be fractional.
add_progress.count = 0.0


def draw_progress(total, count=None, status=""):
    """Draw a one-line textual progress bar on stdout.

    Args:
        total: amount of work that corresponds to 100%.
        count: work done so far; defaults to ``add_progress.count``.
        status: free-form text shown after the percentage. It is shortened to
            whatever room is left on the line, and dropped entirely when the
            terminal is too narrow to show even the truncation marker.

    Does nothing when ``NO_TTY`` is set. The line ends with a carriage return
    rather than a newline, so the next call overwrites it.
    """
    if NO_TTY:
        return
    columns = shutil.get_terminal_size().columns
    placeholder = " [...]"
    status_width = columns - draw_progress.bar_len - len("100.0% ...") - 8
    # textwrap.shorten() raises ValueError when the width cannot even hold the
    # (left-stripped) placeholder, which is the case on terminals of about 80
    # columns or fewer -- including the 80-column fallback used when stdout is
    # not a terminal.
    if status_width >= len(placeholder.lstrip()):
        status = textwrap.shorten(
            status, width=status_width, placeholder=placeholder
        )
    else:
        status = ""
    if count is None:
        count = add_progress.count
    bar_len = draw_progress.bar_len
    if total:
        filled_len = int(round(bar_len * count / float(total)))
        percents = round(100.0 * count / float(total), 1)
    else:
        # Nothing to do at all: show an empty bar instead of dividing by zero.
        filled_len = 0
        percents = 0.0
    bar = "=" * filled_len + "-" * (bar_len - filled_len)

    message = f"[{bar}] {percents}% ...{status}"
    # Pad to the widest message drawn so far, so a shorter message fully
    # overwrites the remains of a longer one.
    draw_progress.max_cols = max(len(message) + 1, draw_progress.max_cols)
    spaces = " " * (draw_progress.max_cols - len(message))
    message += f"{spaces}\r"
    sys.stdout.write(message)
    sys.stdout.flush()


# Width of the widest message drawn since the line was last cleared.
draw_progress.max_cols = 0
# Number of "="/"-" cells between the square brackets.
draw_progress.bar_len = 62


class ManFixer(HTMLParser):
    """Reduce an HTML fragment to its text content.

    Tags are dropped, runs of two or more whitespace characters collapse to a
    single space, and U+FEFF characters are removed. When ``extract_href`` is
    true, the ``href`` of every ``<a>`` tag is emitted (followed by a space)
    in front of the link text.
    """

    # Pattern for the whitespace runs that get collapsed.
    whitespace = r"\s{2,}"
    # Text accumulated so far; reset() gives every instance its own value.
    output = ""
    extract_href = False

    def reset(self):
        """Clear the accumulated text along with the parser state."""
        self.output = ""
        super().reset()

    def _clean(self, text):
        """Collapse whitespace runs and strip U+FEFF from ``text``."""
        return re.sub(self.whitespace, " ", text).replace("\ufeff", "")

    def handle_starttag(self, tag, attrs):
        """Emit the link target of ``<a>`` tags when ``extract_href`` is set."""
        if tag != "a" or not self.extract_href:
            return
        href = dict(attrs).get("href")
        # A valueless attribute (``<a href>``) is reported as None.
        if href is not None:
            self.output += self._clean(href)
            self.output += " "

    def handle_endtag(self, tag):
        """End tags contribute nothing to the output."""

    def handle_data(self, data):
        """Append the cleaned-up text node to the output."""
        self.output += self._clean(data)

    @staticmethod
    def extract(input_, extract_href=False):
        """Return the text content of the HTML string ``input_``.

        Args:
            input_: HTML markup to process.
            extract_href: also include link targets, see the class docstring.
        """
        parser = ManFixer()
        parser.extract_href = extract_href
        parser.feed(input_)
        # feed() may hold back trailing text (for instance after a bare "&")
        # in case more input follows; close() forces it through.
        parser.close()
        return parser.output


@functools.cache
def give_me_head(url, url_, name, section):
    """Probe ``url_`` with a HEAD request and report whether it exists.

    Args:
        url: the template ``url_`` was built from. Not used for the request;
            it only takes part in the cache key.
        url_: the concrete URL to probe.
        name: manual page name (cache key only).
        section: manual page section (cache key only).

    Returns:
        The URL that answered ``200 OK`` -- ``url_`` itself, or the target of
        the redirect chain starting there -- or ``None`` when the page is not
        found, the chain is longer than three redirects, or a redirect carries
        no ``Location`` header.

    Raises:
        Whatever ``http.client`` raises for connection problems; such failures
        are not cached.

    Note that the request always goes over TLS to the default HTTPS port,
    whatever scheme or port ``url_`` carries.
    """
    # Imported here because the file's import block only provides urlparse.
    from urllib.parse import urljoin

    redirect_statuses = (
        HTTPStatus.FOUND,
        HTTPStatus.TEMPORARY_REDIRECT,
        HTTPStatus.PERMANENT_REDIRECT,
        HTTPStatus.MOVED_PERMANENTLY,
    )

    o = urlparse(url_)
    # An empty path is not a valid request target, and the query must be kept.
    target = o.path or "/"
    if o.query:
        target = f"{target}?{o.query}"

    conn = http.client.HTTPSConnection(o.hostname, timeout=6)
    try:
        conn.request(HTTPMethod.HEAD, target)
        response = conn.getresponse()
        status = response.status
        location = response.getheader("Location")
    except Exception:
        # Do not let a failed lookup eat into the next lookup's redirect budget.
        give_me_head.redirects = 0
        raise
    finally:
        conn.close()

    if status in redirect_statuses:
        give_me_head.redirects += 1
        if give_me_head.redirects > 3:
            give_me_head.redirects = 0
            return None
        if location:
            # Location may be relative; resolve it against the probed URL.
            return give_me_head(url, urljoin(url_, location), name, section)
        print("bailout")
        give_me_head.redirects = 0
        return None
    give_me_head.redirects = 0
    if status == HTTPStatus.OK:
        return url_
    return None


# Number of redirects followed in the chain currently being resolved.
give_me_head.redirects = 0


def man_to_path(man: str) -> str:
    """Turn ``"<name><sep><section>"`` into ``"<section>/<name>"``.

    The section is the run of digits at the very end of ``man``; the single
    character in front of it is treated as the separator and dropped, so
    ``"meli.conf.5"`` becomes ``"5/meli.conf"``. Strings that do not end in a
    digit (or consist of digits only) are returned unchanged.
    """
    # The name part must be lazy: a greedy ``.+`` swallows all but the last
    # digit, so a section such as "10" would be split into "1" and "0".
    result = re.match(r"(.+?)(\d+)$", man)
    if not result:
        return man
    name_with_separator, section = result.groups()
    return f"{section}/{name_with_separator[:-1]}"


def draw_spinner():
    """Draw the next spinner frame and schedule the following one.

    Each call writes one frame near the right edge of the terminal line and
    arms a new ``Timer`` that calls this function again after
    ``draw_spinner.interval`` seconds. The pending timer is kept in
    ``draw_spinner.timer`` so it can be cancelled. When ``NO_TTY`` is set the
    function returns without drawing or rescheduling, which ends the chain.
    """
    if NO_TTY:
        return
    # ESC [ <n> C moves the cursor n columns to the right; after the frame is
    # written the cursor goes back to the start of the line.
    message = (
        chr(27)
        + "["
        + str(draw_spinner.columns - 1)
        + "C"
        + draw_spinner.frames[draw_spinner.index]
        + "\r"
    )
    sys.stdout.write(message)
    sys.stdout.flush()
    draw_spinner.index = (draw_spinner.index + 1) % len(draw_spinner.frames)
    draw_spinner.timer = Timer(draw_spinner.interval, draw_spinner)
    # A non-daemon timer keeps the interpreter alive. If the main thread dies,
    # or its cancel() hits a timer that has already fired and is about to
    # re-arm, the spinner would otherwise run forever.
    draw_spinner.timer.daemon = True
    draw_spinner.timer.start()


# Seconds between two frames.
draw_spinner.interval = 0.1
# Terminal width, sampled once at import time.
draw_spinner.columns = shutil.get_terminal_size().columns
draw_spinner.frames = ["|", "/", "-", "\\"]
# Index into ``frames`` of the next frame to draw.
draw_spinner.index = 0

def _parse_bool_flag(value):
    """Turn the text given to ``--refs`` into a real boolean.

    ``argparse`` with ``type=bool`` calls ``bool()`` on the raw string, which
    is true for every non-empty value, including "False" and "0". Only the
    usual negative spellings and the empty string count as false here;
    everything else stays true, as before.
    """
    return value.strip().lower() not in ("", "0", "false", "no", "n", "off")


if __name__ == "__main__":
    # Only needed here, to split a --mandoc value that carries extra arguments.
    import shlex

    draw_spinner.timer = Timer(draw_spinner.interval, draw_spinner)
    # A pending spinner tick must never keep the interpreter alive on exit.
    draw_spinner.timer.daemon = True
    parser = argparse.ArgumentParser(
        description="Generates a <table> element from a mdoc manpage."
    )
    parser.add_argument("page", type=str, help="mdoc file")
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        help="name of output file",
        required=False,
        default=None,
    )
    parser.add_argument(
        "--name",
        "-n",
        type=str,
        help="name used for html IDs. defaults to the file name of the page.",
        required=False,
        default=None,
    )
    parser.add_argument(
        "--refs",
        type=_parse_bool_flag,
        help="find external manpages and hyperlink to them",
        required=False,
        default=False,
    )
    parser.add_argument(
        "--no-tty",
        help="don't draw progress animation",
        required=False,
        default=False,
        action="store_true",
    )
    parser.add_argument(
        "--exclude-refs",
        type=str,
        help="comma separated list of manpages to not hyperlink",
        required=False,
        default="",
    )
    parser.add_argument(
        "--mandoc",
        type=str,
        help="alternative mandoc binary path",
        required=False,
        default="mandoc",
    )

    args = parser.parse_args()
    exclude_refs = {
        entry.strip() for entry in args.exclude_refs.split(",") if entry.strip()
    }
    if not args.output:
        args.output = Path.cwd() / (Path(args.page).name + ".html")
    if not args.name:
        args.name = Path(args.page).name

    # Read the page once so that a missing or non-UTF-8 file is reported
    # before mandoc is started; the content itself is not needed here.
    with open(args.page, "r", encoding="utf-8") as page_file:
        page_file.read()

    NO_TTY = args.no_tty

    mandoc_output_options = "fragment,toc,includes=#%I"
    if args.refs:
        # Name and section are emitted tab-separated and split again below.
        mandoc_output_options += ",man=%N\t%S"
    # mandoc is run directly with an argument list: no shell quoting problems
    # with the page path, and check=True sees mandoc's own exit status rather
    # than that of a trailing pipeline stage.
    html_output = subprocess.run(
        [
            *shlex.split(args.mandoc),
            "-I",
            "os=rendered by mandoc",
            "-Kutf-8",
            f"-O{mandoc_output_options}",
            "-Thtml",
            args.page,
        ],
        stdout=PIPE,
        check=True,
    ).stdout.decode("utf-8")

    # Drop whitespace that directly precedes a closing </pre on the same line
    # (formerly done by piping the output through sed).
    html_output = re.sub(r"[^\S\n]*</pre", "</pre", html_output)

    # Wrap ⟨key⟩ markers in <kbd>. The match is lazy so that several markers
    # on one line are wrapped one by one instead of as a single span.
    html_output = re.sub(
        r"(?:⟨|&#x27E8;)(.+?)(?:⟩|&#x27E9;)",
        '<kbd class="manpage-kbd">\\1</kbd>',
        html_output,
    )

    soup = BeautifulSoup(html_output, "html.parser")
    # Ids present in mandoc's output; they and the links to them get prefixed
    # with the page name further down.
    targets = {
        tag.get("id") for tag in soup.find_all(lambda tag: tag.has_attr("id"))
    }
    root_table = soup.find(True)
    if root_table is None:
        sys.exit(f"mandoc produced no HTML elements for {args.page}")
    root_table["id"] = args.name

    links = soup.find_all("a")
    total = len(links)
    if args.refs:
        print(f"Replacing `href` attributes in {total} hyperlinks...")
        draw_spinner.timer.start()
    try:
        for link in links:
            href = link.get("href")
            if href is None or href.startswith("#"):
                # Internal anchors are rewritten whether or not --refs is
                # given, because the ids they point to are always prefixed.
                if href is not None and href[1:] in targets:
                    link["href"] = "#" + args.name + "_" + href[1:]
                if args.refs:
                    add_progress()
                    draw_progress(total)
                continue
            if not args.refs:
                continue

            result = re.match(r"(.+)\t(.+)$", href)
            if not result:
                add_progress()
                draw_progress(total)
                continue

            name = result[1]
            section = result[2]
            # Fallback used when the page is excluded or no mirror has it.
            link["href"] = f"./{name}.{section}.html"
            if name in exclude_refs or f"{name}.{section}" in exclude_refs:
                add_progress()
                draw_progress(
                    total,
                    status=f"{name}.{section}: Excluding ref because it is in --exclude-refs list. Leaving it as {link['href']}",
                )
                continue

            found = False
            for url in TEMPLATES:
                # Every template accounts for an equal share of this link's
                # progress, including those skipped after a hit.
                add_progress(1.0 / (len(TEMPLATES) * 1.0))
                if found:
                    continue
                draw_progress(
                    total,
                    status=f"{name}.{section}: searching for an online mirror",
                )
                url_ = url.replace("%N", name).replace("%S", section)
                try:
                    got = give_me_head(url, url_, name, section)
                except Exception as exc:
                    # Best effort: an unreachable mirror only costs this
                    # candidate, the remaining templates are still tried.
                    if "handshake operation timed out" not in str(exc):
                        print(f"got {exc} for url {url_}")
                else:
                    if got:
                        link["href"] = got
                        found = True
    finally:
        if args.refs:
            # Stop the spinner even when the loop above raised.
            draw_spinner.timer.cancel()
            clear_line(None, None)

    for target in soup.find_all(lambda tag: tag.has_attr("id")):
        id_ = target.get("id")
        if id_ in targets:
            target["id"] = args.name + "_" + id_

    with open(args.output, "w", encoding="utf-8") as f:
        f.write(
            """
        <style>
        code.Ic, code.Li, code.Cm, code.Nm, kbd.manpage-kbd{
            display: inline-block;
        }
        kbd {
          background-color: #eee;
          border-radius: 3px;
          border: 1px solid #b4b4b4;
          box-shadow:
            0 1px 1px rgba(0, 0, 0, 0.2),
            0 2px 0 0 rgba(255, 255, 255, 0.7) inset;
          color: #333;
          display: inline-block;
          font-size: 0.85em;
          font-weight: 700;
          line-height: 1;
          padding: 2px 4px;
          white-space: nowrap;
        }
        code {
          background-color: #eee;
          border-radius: 3px;
          font-family: courier, monospace;
          padding: 0 3px;
        }
        </style>
        """
        )
        f.write(soup.prettify())
    print("Written to ", args.output)
Add scripts/make_html_manual_page.py Signed-off-by: Manos Pitsidianakis <manos@pitsidianak.is> 2023-08-29 11:53:11 +00:00			`# meli - scripts/make_html_manual_page.py`
			`#`
			`# Copyright 2023 Manos Pitsidianakis`
			`#`
			`# This file is part of meli.`
			`#`
			`# meli is free software: you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License as published by`
			`# the Free Software Foundation, either version 3 of the License, or`
			`# (at your option) any later version.`
			`#`
			`# meli is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU General Public License`
			`# along with meli. If not, see <http://www.gnu.org/licenses/>.`

			`import subprocess`
			`from threading import Timer`
			`from html.parser import HTMLParser`
			`import argparse`
			`import sys`
			`import re`
			`import shutil`
			`import textwrap`
			`from subprocess import PIPE`
			`from urllib.parse import urlparse`
			`from pathlib import Path`
			`import http.client`
			`from http import HTTPMethod, HTTPStatus`
			`import signal`
			`import functools`
			`from bs4 import BeautifulSoup`

			`NO_TTY = False`


			`def clear_line(signum, frame):`
			`if NO_TTY:`
			`return`
			`columns = shutil.get_terminal_size().columns`
			`message = chr(27) + "[0G" # go to start of line`
			`sys.stdout.write(message)`
			`message = " " * columns + "\r"`
			`sys.stdout.write(message)`
			`sys.stdout.flush()`
			`draw_progress.max_cols = 0`


			`signal.signal(signal.SIGWINCH, clear_line)`

			`TEMPLATES = [`
			`"http://linux.die.net/man/%S/%N",`
			`"http://man7.org/linux/man-pages/man%S/%N.%S.html",`
			`"http://manpages.debian.org/stable/%N.%S.en.html",`
			`"http://man.archlinux.org/man/%N.%S",`
			`"http://man.voidlinux.org/%N.%S",`
			`"http://man.bsd.lv/%N.%S",`
			`"http://man.bsd.lv/OpenBSD-7.0/%N.%S",`
			`"http://man.bsd.lv/FreeBSD-13.0/%N.%S",`
			`"http://man.bsd.lv/POSIX-2013/%N.%S", # last resorts`
			`"http://man.bsd.lv/UNIX-7/%N.%S",`
			`]`


			`def add_progress(count=1):`
			`add_progress.count += count`


			`add_progress.count = 0.0`


			`def draw_progress(total, count=None, status=""):`
			`if NO_TTY:`
			`return`
			`columns = shutil.get_terminal_size().columns`
			`status = textwrap.shorten(`
			`status, width=columns - draw_progress.bar_len - len("100.0% ...") - 8`
			`)`
			`if count is None:`
			`count = add_progress.count`
			`bar_len = draw_progress.bar_len`
			`filled_len = int(round(bar_len * count / float(total)))`

			`percents = round(100.0 * count / float(total), 1)`
			`bar = "=" * filled_len + "-" * (bar_len - filled_len)`

			`message = f"[{bar}] {percents}% ...{status}"`
			`draw_progress.max_cols = max(len(message) + 1, draw_progress.max_cols)`
			`spaces = " " * (draw_progress.max_cols - len(message))`
			`message += f"{spaces}\r"`
			`sys.stdout.write(message)`
			`sys.stdout.flush()`


			`draw_progress.max_cols = 0`
			`draw_progress.bar_len = 62`


			`class ManFixer(HTMLParser):`
			`whitespace = r"\s{2,}"`
			`output = ""`
			`extract_href = False`

			`def reset(self):`
			`self.output = ""`
			`super().reset()`

			`def handle_starttag(self, tag, attrs):`
			`attrs = {a[0]: a[1] for a in attrs}`
			`if tag == "a" and self.extract_href and "href" in attrs:`
			`self.output += re.sub(self.whitespace, " ", attrs["href"]).replace(`
			`"\ufeff", ""`
			`)`
			`self.output += " "`

			`def handle_endtag(self, tag):`
			`pass`

			`def handle_data(self, data):`
			`self.output += re.sub(self.whitespace, " ", data).replace("\ufeff", "")`

			`@staticmethod`
			`def extract(input_):`
			`parser = ManFixer()`
			`parser.feed(input_)`
			`return parser.output`


			`@functools.cache`
			`def give_me_head(url, url_, name, section):`
			`o = urlparse(url_)`
			`conn = http.client.HTTPSConnection(o.hostname, timeout=6)`
			`conn.request(HTTPMethod.HEAD, o.path)`
			`response = conn.getresponse()`
			`if (`
			`response.status == HTTPStatus.FOUND`
			`or response.status == HTTPStatus.TEMPORARY_REDIRECT`
			`or response.status == HTTPStatus.PERMANENT_REDIRECT`
			`or response.status == HTTPStatus.MOVED_PERMANENTLY`
			`):`
			`# print("for ", url_, "following redirect", response.status)`
			`give_me_head.redirects += 1`
			`if give_me_head.redirects > 3:`
			`return None`
			`if response.getheader("Location"):`
			`# print("for ", url_, "following redirect to ", response.getheader("Location"))`
			`return give_me_head(url, response.getheader("Location"), name, section)`
			`print("bailout")`
			`return None`
			`# print("for ", url_, "code is ", response.status)`
			`give_me_head.redirects = 0`
			`if response.status == http.HTTPStatus.OK:`
			`return url_`
			`return None`


			`give_me_head.redirects = 0`


			`def man_to_path(man: str) -> str:`
			`exp = r"(.+)(\d{1,})$"`
			`result = re.match(exp, man)`
			`if not result:`
			`return man`
			`return f"{result[2]}/{result[1][:-1]}"`


			`def draw_spinner():`
			`if NO_TTY:`
			`return`
			`message = (`
			`chr(27)`
			`+ "["`
			`+ str(draw_spinner.columns - 1)`
			`+ "C"`
			`+ draw_spinner.frames[draw_spinner.index]`
			`+ "\r"`
			`)`
			`sys.stdout.write(message)`
			`sys.stdout.flush()`
			`draw_spinner.index += 1`
			`draw_spinner.index = draw_spinner.index % len(draw_spinner.frames)`
			`draw_spinner.timer = Timer(draw_spinner.interval, draw_spinner)`
			`draw_spinner.timer.start()`


			`draw_spinner.interval = 0.1`
			`draw_spinner.columns = shutil.get_terminal_size().columns`
			`draw_spinner.frames = ["\|", "/", "-", "\\"]`
			`draw_spinner.index = 0`

			`if __name__ == "__main__":`
			`draw_spinner.timer = Timer(draw_spinner.interval, draw_spinner)`
			`parser = argparse.ArgumentParser(`
			`description="Generates a <table> element from a mdoc manpage."`
			`)`
			`parser.add_argument("page", type=str, help="mdoc file")`
			`parser.add_argument(`
			`"--output",`
			`"-o",`
			`type=str,`
			`help="name of output file",`
			`required=False,`
			`default=None,`
			`)`
			`parser.add_argument(`
			`"--name",`
			`"-n",`
			`type=str,`
			`help="name used for html IDs. defaults to file name stem.",`
			`required=False,`
			`default=None,`
			`)`
			`parser.add_argument(`
			`"--refs",`
			`type=bool,`
			`help="find external manpages and hyperlink to them",`
			`required=False,`
			`default=False,`
			`)`
			`parser.add_argument(`
			`"--no-tty",`
			`help="don't draw progress animation",`
			`required=False,`
			`default=False,`
			`action="store_true",`
			`)`
			`parser.add_argument(`
			`"--exclude-refs",`
			`type=str,`
			`help="comma separated list of manpages to not hyperlink",`
			`required=False,`
			`default="",`
			`)`
			`parser.add_argument(`
			`"--mandoc",`
			`type=str,`
			`help="alternative mandoc binary path",`
			`required=False,`
			`default="mandoc",`
			`)`

			`args = parser.parse_args()`
			`if args.exclude_refs:`
			`args.exclude_refs = [s.strip() for s in args.exclude_refs.split(",")]`
			`if not args.output:`
			`args.output = Path.cwd() / (Path(args.page).name + ".html")`
			`if not args.name:`
			`args.name = Path(args.page).name`

			`manpage = open(args.page, "r", encoding="utf-8").read()`
			`if args.refs:`
			`refs_url = ',man="%N\t%S"'`
			`else:`
			`refs_url = ""`

			`NO_TTY = args.no_tty`
			`html_output = subprocess.run(`
			`f'{args.mandoc} -I os="rendered by mandoc" -Kutf-8 -Ofragment,toc,includes="#%I"{refs_url} -Thtml "{args.page}" \| sed \'s/\s*<\/pre/<\/pre/\'',`
			`stdout=PIPE,`
			`shell=True,`
			`check=True,`
			`).stdout.decode("utf-8")`

			`html_output = re.sub(`
			`r"(?:(?:[⟨])\|(?:⟨))(.+)(?:(?:[⟩])\|(?:⟩))",`
			`'<kbd class="manpage-kbd">\\1</kbd>',`
			`html_output,`
			`flags=re.MULTILINE,`
			`)`

			`soup = BeautifulSoup(html_output, "html.parser")`
			`targets = set()`
			`for target in soup.find_all(lambda tag: tag.has_attr("id")):`
			`id_ = target.get("id")`
			`targets.add(id_)`
			`root_table = next(soup.children)`
			`root_table["id"] = args.name`

			`if args.refs:`
			`total = len(soup.find_all("a"))`
			print(f"Replacing `href` attributes in {total} hyperlinks...")

			`draw_spinner.timer.start()`
			`for link in soup.find_all("a"):`
			`href = link.get("href")`
			`if href.startswith("#") and href[1:] in targets:`
			`link["href"] = "#" + args.name + "_" + href[1:]`
			`add_progress()`
			`draw_progress(total)`
			`elif href.startswith("#"):`
			`add_progress()`
			`draw_progress(total)`
			`else:`
			`exp = r"(.+)\t(.+)$"`
			`result = re.match(exp, href)`
			`if result:`
			`link["href"] = f"./{result[1]}.{result[2]}.html"`
			`name = result[1]`
			`section = result[2]`
			`if (`
			`name in args.exclude_refs`
			`or f"{name}.{section}" in args.exclude_refs`
			`):`
			`add_progress()`
			`draw_progress(`
			`total,`
			`status=f"{name}.{section}: Excluding ref because it is in --exclude-refs list. Leaving it as {link['href']}",`
			`)`
			`continue`
			`found = False`
			`for url in TEMPLATES:`
			`add_progress(1.0 / (len(TEMPLATES) * 1.0))`
			`if found:`
			`continue`
			`draw_progress(`
			`total,`
			`status=f"{name}.{section}: searching for an online mirror",`
			`)`
			`url_ = url.replace("%N", name).replace("%S", section)`
			`try:`
			`got = give_me_head(url, url_, name, section)`
			`if got:`
			`link["href"] = got`
			`found = True`
			`continue`
			`except Exception as exc:`
			`if "handshake operation timed out" not in str(exc):`
			`print(f"got {exc} for url {url_}")`
			`else:`
			`add_progress()`
			`draw_progress(total)`
			`draw_spinner.timer.cancel()`
			`clear_line(None, None)`

			`for target in soup.find_all(lambda tag: tag.has_attr("id")):`
			`id_ = target.get("id")`
			`if id_ in targets:`
			`id_ = args.name + "_" + id_`
			`target["id"] = id_`

			`with open(args.output, "w", encoding="utf-8") as f:`
			`f.write(`
			`"""`
			`<style>`
			`code.Ic, code.Li, code.Cm, code.Nm, kbd.manpage-kbd{`
			`display: inline-block;`
			`}`
			`kbd {`
			`background-color: #eee;`
			`border-radius: 3px;`
			`border: 1px solid #b4b4b4;`
			`box-shadow:`
			`0 1px 1px rgba(0, 0, 0, 0.2),`
			`0 2px 0 0 rgba(255, 255, 255, 0.7) inset;`
			`color: #333;`
			`display: inline-block;`
			`font-size: 0.85em;`
			`font-weight: 700;`
			`line-height: 1;`
			`padding: 2px 4px;`
			`white-space: nowrap;`
			`}`
			`code {`
			`background-color: #eee;`
			`border-radius: 3px;`
			`font-family: courier, monospace;`
			`padding: 0 3px;`
			`}`
			`</style>`
			`"""`
			`)`
			`f.write(soup.prettify())`
			`print("Written to ", args.output)`