#!/usr/bin/python3
#
# gmi2html - convert Gemtext to HTML
#
# SPDX-FileCopyrightText: 2023 Daniel Kalak
# SPDX-License-Identifier: GPL-3.0-or-later
#
# Reads Gemtext from stdin, writes an HTML fragment to stdout and style
# diagnostics to stderr. The exit status is 1 if any diagnostic was printed,
# 0 otherwise.

import sys

# This aims to replace an enum.
(
    PRE_TOGGLE, PRE, H1, H2, H3, LIST, LINK,
    QUOTE_TEXT, QUOTE_GAP, BODY_TEXT, BODY_GAP,
) = range(11)

QUOTE = (QUOTE_TEXT, QUOTE_GAP)
BODY = (BODY_TEXT, BODY_GAP)

PRE_TOGGLE_PREFIX = "```"

# ("", BODY) must come last because "" would match prematurely in categorize().
# Note that QUOTE and BODY don't need brackets because they already are tuples.
PREFIXES = [
    (PRE_TOGGLE_PREFIX, [PRE_TOGGLE]),
    ("# ", [H1]),
    ("## ", [H2]),
    ("### ", [H3]),
    ("* ", [LIST]),
    ("=>", [LINK]),
    (">", QUOTE),
    ("", BODY),
]

# ("&", "&amp;") must come first so that the "&" introduced by the other
# replacements isn't escaped a second time in escape().
ESCAPES = [
    ("&", "&amp;"),
    ("<", "&lt;"),
    (">", "&gt;"),
    ("\"", "&quot;"),
    ("'", "&#39;"),
]

# HTML element name used for each heading category.
HEADING_TAGS = {H1: "h1", H2: "h2", H3: "h3"}


def escape(s):
    """Return s with the HTML-significant characters replaced by entities."""
    for char, entity in ESCAPES:
        s = s.replace(char, entity)
    return s


def categorize(line, pre):
    """Classify one input line.

    line is a raw line from the input, with or without its trailing "\n".
    pre tells whether the line is inside a preformatted block.

    Returns (category, escaped_line) where escaped_line is the HTML-escaped
    content with the line-type prefix and the trailing newline removed.
    """
    # Only strip a newline that is really there: the last line of the input
    # may lack one, and blindly dropping the last character would eat content.
    content = line.removesuffix("\n")
    if pre and not content.startswith(PRE_TOGGLE_PREFIX):
        return (PRE, escape(content))
    # The loop only returns if PREFIXES contains ("", _).
    for prefix, categories in PREFIXES:
        if content.startswith(prefix):
            content = content[len(prefix):]
            is_gap = content == "" or content.isspace()
            # categories[-1] works independently of whether categories has 1
            # or 2 elements.
            category = categories[-1] if is_gap else categories[0]
            return (category, escape(content))


def close_tags(prev, curr, pre):
    """Write the closing tag of the block that ends between prev and curr."""
    # (prev == PRE) instead of (pre) doesn't work because there could be 2
    # adjacent PRE_TOGGLE lines with no PRE lines inbetween.
    if pre and curr == PRE_TOGGLE:
        sys.stdout.write("</pre>\n")
    elif prev == LIST and curr != LIST:
        sys.stdout.write("</ul>\n")
    elif prev in QUOTE and curr not in QUOTE:
        sys.stdout.write("</blockquote>\n")


def open_tags(prev, curr, pre):
    """Write the opening tag of the block that starts with curr."""
    # (prev != PRE) instead of (not pre) doesn't work because there could be 2
    # adjacent PRE_TOGGLE lines after a PRE line with no PRE lines inbetween.
    if not pre and curr == PRE_TOGGLE:
        sys.stdout.write("<pre>\n")
    elif prev != LIST and curr == LIST:
        sys.stdout.write("<ul>\n")
    elif prev not in QUOTE and curr in QUOTE:
        # Mirrors the </blockquote> condition in close_tags() so that every
        # quote block is opened exactly once and closed exactly once.
        sys.stdout.write("<blockquote>\n")


def print_line(prev, curr, escaped_line):
    """Write the HTML for a single non-toggle line, followed by a newline.

    <p> and <li> are left unclosed on purpose; HTML allows their end tags to
    be omitted. Gap lines produce only the newline.
    """
    # A run of equal text-like lines forms one paragraph joined by <br>.
    separator = "<p>" if prev != curr else "<br>"
    if curr == PRE:
        sys.stdout.write(escaped_line)
    elif curr in HEADING_TAGS:
        tag = HEADING_TAGS[curr]
        sys.stdout.write("<{0}>{1}</{0}>".format(tag, escaped_line))
    elif curr == LIST:
        sys.stdout.write("<li>{}".format(escaped_line))
    elif curr == LINK:
        sys.stdout.write(separator)
        # escaped_line.removeprefix(" ") isn't necessary here because split()
        # covers that case.
        parts = escaped_line.split(maxsplit=1)
        if parts:
            # parts[-1] works independently of whether parts has 1 or 2
            # elements.
            url, description = parts[0], parts[-1]
        else:
            # A bare "=>" has no URL; check_character_sequence() reports it,
            # so don't crash here.
            url, description = "", ""
        sys.stdout.write("<a href=\"{}\">{}</a>".format(url, description))
    elif curr == QUOTE_TEXT:
        sys.stdout.write(separator)
        sys.stdout.write(escaped_line.removeprefix(" "))
    elif curr == BODY_TEXT:
        sys.stdout.write(separator)
        sys.stdout.write(escaped_line)
    sys.stdout.write("\n")


def print_error(i, msg):
    """Write a diagnostic for line i to stderr; return the write() result."""
    return sys.stderr.write("Line {}: {}\n".format(i, msg))


# Return True on error, False otherwise.
def check_line_sequence(i, prev, curr):
    """Check that the transition from category prev to curr is well-formed.

    i is the line number of curr. prev is None before the first line and curr
    is None after the last one.
    """
    error = False
    if prev == BODY_GAP and curr == BODY_GAP:
        print_error(i, "consecutive empty lines")
        error = True
    elif prev == QUOTE_GAP and curr == QUOTE_GAP:
        print_error(i, "consecutive empty lines in quote")
        error = True
    elif prev not in QUOTE and curr == QUOTE_GAP:
        print_error(i, "first line in quote is empty")
        error = True
    elif prev == QUOTE_GAP and curr not in QUOTE:
        print_error(i - 1, "last line in quote is empty")
        error = True
    elif prev is None and curr == BODY_GAP:
        print_error(i, "first line is empty")
        error = True
    elif prev == BODY_GAP and curr is None:
        print_error(i - 1, "last line is empty")
        error = True
    elif prev == PRE_TOGGLE and curr == PRE_TOGGLE:
        print_error(i, "code block is empty")
        error = True

    # Blocks must be separated from whatever precedes them by an empty line.
    starts_block = (
        (curr == PRE_TOGGLE and prev != PRE)
        or curr in (H1, H2, H3)
        or (curr in (LIST, LINK, BODY_TEXT) and prev != curr)
        or (curr in QUOTE and prev not in QUOTE)
    )
    if prev is not None and prev != BODY_GAP and starts_block:
        print_error(i, "empty line missing")
        error = True
    return error


# Return True on error, False otherwise.
def check_character_sequence(i, curr, escaped_line):
    """Check the whitespace and content rules inside a single line.

    i is the line number, curr its category and escaped_line its content as
    returned by categorize().
    """
    error = False
    if curr == PRE_TOGGLE and escaped_line != "":
        print_error(i, "text after ```")
        error = True
    # This if statement doesn't cover cases where the line contains only
    # whitespace (which technically should also count as "no content"), but
    # those cases are covered by the if statement after it that checks against
    # trailing whitespace.
    if escaped_line == "" and curr in (H1, H2, H3, LINK, LIST):
        print_error(i, "no content")
        error = True
    if escaped_line != "" and escaped_line[-1].isspace():
        print_error(i, "trailing whitespace")
        error = True
    if curr in (LINK, QUOTE_TEXT):
        if escaped_line != "" and escaped_line[0] != " ":
            # Name the prefix that was actually used on this line.
            marker = "=>" if curr == LINK else ">"
            print_error(i, "missing space after {}".format(marker))
            error = True
        else:
            # Drop the separating space so that only additional whitespace is
            # reported as leading whitespace below.
            escaped_line = escaped_line[1:]
    if escaped_line != "" and escaped_line[0].isspace() and curr != PRE:
        print_error(i, "leading whitespace")
        error = True
    return error


def main():
    """Convert stdin to HTML on stdout; return the process exit status."""
    line_no = 0
    prev, curr = None, None
    pre = False
    errors = False
    for line in sys.stdin:
        line_no += 1
        prev = curr
        curr, escaped_line = categorize(line, pre)
        close_tags(prev, curr, pre)
        open_tags(prev, curr, pre)
        if curr == PRE_TOGGLE:
            pre = not pre
        else:
            print_line(prev, curr, escaped_line)
        errors |= check_line_sequence(line_no, prev, curr)
        errors |= check_character_sequence(line_no, curr, escaped_line)
    close_tags(curr, None, pre)
    if pre:
        # The input ended inside a code block: close it so the output stays
        # well-formed, and report it.
        sys.stdout.write("</pre>\n")
        print_error(line_no, "code block is not closed")
        errors = True
    errors |= check_line_sequence(line_no + 1, curr, None)
    return 1 if errors else 0


if __name__ == "__main__":
    sys.exit(main())