# meli - scripts/make_html_manual_page.py # # Copyright 2023 Manos Pitsidianakis # # This file is part of meli. # # meli is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # meli is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with meli. If not, see . import subprocess from threading import Timer from html.parser import HTMLParser import argparse import sys import re import shutil import textwrap from subprocess import PIPE from urllib.parse import urlparse from pathlib import Path import http.client from http import HTTPMethod, HTTPStatus import signal import functools from bs4 import BeautifulSoup NO_TTY = False def clear_line(signum, frame): if NO_TTY: return columns = shutil.get_terminal_size().columns message = chr(27) + "[0G" # go to start of line sys.stdout.write(message) message = " " * columns + "\r" sys.stdout.write(message) sys.stdout.flush() draw_progress.max_cols = 0 signal.signal(signal.SIGWINCH, clear_line) TEMPLATES = [ "http://linux.die.net/man/%S/%N", "http://man7.org/linux/man-pages/man%S/%N.%S.html", "http://manpages.debian.org/stable/%N.%S.en.html", "http://man.archlinux.org/man/%N.%S", "http://man.voidlinux.org/%N.%S", "http://man.bsd.lv/%N.%S", "http://man.bsd.lv/OpenBSD-7.0/%N.%S", "http://man.bsd.lv/FreeBSD-13.0/%N.%S", "http://man.bsd.lv/POSIX-2013/%N.%S", # last resorts "http://man.bsd.lv/UNIX-7/%N.%S", ] def add_progress(count=1): add_progress.count += count add_progress.count = 0.0 def draw_progress(total, count=None, status=""): if NO_TTY: return columns = shutil.get_terminal_size().columns status = textwrap.shorten( status, width=columns - draw_progress.bar_len - len("100.0% ...") - 8 ) if count is None: count = add_progress.count bar_len = draw_progress.bar_len filled_len = int(round(bar_len * count / float(total))) percents = round(100.0 * count / float(total), 1) bar = "=" * filled_len + "-" * (bar_len - filled_len) message = f"[{bar}] {percents}% ...{status}" draw_progress.max_cols = max(len(message) + 1, draw_progress.max_cols) spaces = " " * (draw_progress.max_cols - len(message)) message += f"{spaces}\r" sys.stdout.write(message) sys.stdout.flush() draw_progress.max_cols = 0 draw_progress.bar_len = 62 class ManFixer(HTMLParser): whitespace = r"\s{2,}" output = "" extract_href = False def reset(self): self.output = "" super().reset() def handle_starttag(self, tag, attrs): attrs = {a[0]: a[1] for a in attrs} if tag == "a" and self.extract_href and "href" in attrs: self.output += re.sub(self.whitespace, " ", attrs["href"]).replace( "\ufeff", "" ) self.output += " " def handle_endtag(self, tag): pass def handle_data(self, data): self.output += re.sub(self.whitespace, " ", data).replace("\ufeff", "") @staticmethod def extract(input_): parser = ManFixer() parser.feed(input_) return parser.output @functools.cache def give_me_head(url, url_, name, section): o = urlparse(url_) conn = http.client.HTTPSConnection(o.hostname, timeout=6) conn.request(HTTPMethod.HEAD, o.path) response = conn.getresponse() if ( response.status == HTTPStatus.FOUND or response.status == HTTPStatus.TEMPORARY_REDIRECT or response.status == HTTPStatus.PERMANENT_REDIRECT or response.status == HTTPStatus.MOVED_PERMANENTLY ): # print("for ", url_, "following redirect", response.status) give_me_head.redirects += 1 if give_me_head.redirects > 3: return None if response.getheader("Location"): # print("for ", url_, "following redirect to ", response.getheader("Location")) return give_me_head(url, response.getheader("Location"), name, section) print("bailout") return None # print("for ", url_, "code is ", response.status) give_me_head.redirects = 0 if response.status == http.HTTPStatus.OK: return url_ return None give_me_head.redirects = 0 def man_to_path(man: str) -> str: exp = r"(.+)(\d{1,})$" result = re.match(exp, man) if not result: return man return f"{result[2]}/{result[1][:-1]}" def draw_spinner(): if NO_TTY: return message = ( chr(27) + "[" + str(draw_spinner.columns - 1) + "C" + draw_spinner.frames[draw_spinner.index] + "\r" ) sys.stdout.write(message) sys.stdout.flush() draw_spinner.index += 1 draw_spinner.index = draw_spinner.index % len(draw_spinner.frames) draw_spinner.timer = Timer(draw_spinner.interval, draw_spinner) draw_spinner.timer.start() draw_spinner.interval = 0.1 draw_spinner.columns = shutil.get_terminal_size().columns draw_spinner.frames = ["|", "/", "-", "\\"] draw_spinner.index = 0 if __name__ == "__main__": draw_spinner.timer = Timer(draw_spinner.interval, draw_spinner) parser = argparse.ArgumentParser( description="Generates a element from a mdoc manpage." ) parser.add_argument("page", type=str, help="mdoc file") parser.add_argument( "--output", "-o", type=str, help="name of output file", required=False, default=None, ) parser.add_argument( "--name", "-n", type=str, help="name used for html IDs. defaults to file name stem.", required=False, default=None, ) parser.add_argument( "--refs", type=bool, help="find external manpages and hyperlink to them", required=False, default=False, ) parser.add_argument( "--no-tty", help="don't draw progress animation", required=False, default=False, action="store_true", ) parser.add_argument( "--exclude-refs", type=str, help="comma separated list of manpages to not hyperlink", required=False, default="", ) parser.add_argument( "--mandoc", type=str, help="alternative mandoc binary path", required=False, default="mandoc", ) args = parser.parse_args() if args.exclude_refs: args.exclude_refs = [s.strip() for s in args.exclude_refs.split(",")] if not args.output: args.output = Path.cwd() / (Path(args.page).name + ".html") if not args.name: args.name = Path(args.page).name manpage = open(args.page, "r", encoding="utf-8").read() if args.refs: refs_url = ',man="%N\t%S"' else: refs_url = "" NO_TTY = args.no_tty html_output = subprocess.run( f'{args.mandoc} -I os="rendered by mandoc" -Kutf-8 -Ofragment,toc,includes="#%I"{refs_url} -Thtml "{args.page}" | sed \'s/\s*<\/pre/<\/pre/\'', stdout=PIPE, shell=True, check=True, ).stdout.decode("utf-8") html_output = re.sub( r"(?:(?:[⟨])|(?:⟨))(.+)(?:(?:[⟩])|(?:⟩))", '\\1', html_output, flags=re.MULTILINE, ) soup = BeautifulSoup(html_output, "html.parser") targets = set() for target in soup.find_all(lambda tag: tag.has_attr("id")): id_ = target.get("id") targets.add(id_) root_table = next(soup.children) root_table["id"] = args.name if args.refs: total = len(soup.find_all("a")) print(f"Replacing `href` attributes in {total} hyperlinks...") draw_spinner.timer.start() for link in soup.find_all("a"): href = link.get("href") if href.startswith("#") and href[1:] in targets: link["href"] = "#" + args.name + "_" + href[1:] add_progress() draw_progress(total) elif href.startswith("#"): add_progress() draw_progress(total) else: exp = r"(.+)\t(.+)$" result = re.match(exp, href) if result: link["href"] = f"./{result[1]}.{result[2]}.html" name = result[1] section = result[2] if ( name in args.exclude_refs or f"{name}.{section}" in args.exclude_refs ): add_progress() draw_progress( total, status=f"{name}.{section}: Excluding ref because it is in --exclude-refs list. Leaving it as {link['href']}", ) continue found = False for url in TEMPLATES: add_progress(1.0 / (len(TEMPLATES) * 1.0)) if found: continue draw_progress( total, status=f"{name}.{section}: searching for an online mirror", ) url_ = url.replace("%N", name).replace("%S", section) try: got = give_me_head(url, url_, name, section) if got: link["href"] = got found = True continue except Exception as exc: if "handshake operation timed out" not in str(exc): print(f"got {exc} for url {url_}") else: add_progress() draw_progress(total) draw_spinner.timer.cancel() clear_line(None, None) for target in soup.find_all(lambda tag: tag.has_attr("id")): id_ = target.get("id") if id_ in targets: id_ = args.name + "_" + id_ target["id"] = id_ with open(args.output, "w", encoding="utf-8") as f: f.write( """ """ ) f.write(soup.prettify()) print("Written to ", args.output)