py/gadict_html.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Sat, 22 Jul 2023 00:13:47 +0300
changeset 1351 ae963ee79b49
parent 1319 679972640f47
permissions -rw-r--r--
Added support for 25k freq wordlist for dictionary as an HTML page.

# -*- coding: utf-8 -*-
"""HTML format writer"""

import io
import sys
import codecs
import re
from xml.sax.saxutils import escape

import gadict
import gadict_freq


# State filled in from the command line: input path, output path, the
# optional language filter and the collected (tag, word set) pairs.
FINAME = None
FONAME = None
LANGS = None
FREQ_SOURCES = []

# Language filter option, e.g. "-lang:ru,uk".
ARG_LANG_RE = re.compile("-lang:(.+)")
# Frequency source option: "-freq:var:TAG=FILE" or "-freq:freq:TAG=FILE".
ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)")
# Word group option: "-grp:GLOB".
ARG_GRP_RE = re.compile("-grp:(.+)")

# A bare "--" switches option parsing off; every later argument is then
# treated as a file name, even when it starts with "-".
look_for_files = False
for arg in sys.argv[1:]:
    if arg == "--":
        look_for_files = True
        continue
    if not look_for_files:
        lang_match = ARG_LANG_RE.match(arg)
        if lang_match is not None:
            LANGS = set(lang_match.group(1).split(","))
            # Each language code must be exactly two characters long.
            if any(len(code) != 2 for code in LANGS):
                raise Exception("Incorrect language specification: '{:s}'".format(arg))
            continue
        freq_match = ARG_FREQ_RE.match(arg)
        if freq_match is not None:
            mode, tag, fname = freq_match.groups()
            with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
                if mode == "var":
                    parser = gadict_freq.WordformParser(stream)
                elif mode == "freq":
                    parser = gadict_freq.FreqlistParser(stream)
                else:
                    raise Exception("Unsupported mode: '{:s}'".format(mode))
                wlist = parser.parse()
            FREQ_SOURCES.append((tag, set(wlist)))
            continue
        grp_match = ARG_GRP_RE.match(arg)
        if grp_match is not None:
            grp_parser = gadict_freq.WordformGroupParser(grp_match.group(1))
            # Every group returned by the parser becomes its own tagged source.
            FREQ_SOURCES.extend(
                (grp_tag, grp_words)
                for (grp_tag, grp_words) in grp_parser.parse().items())
            continue
        if arg.startswith("-"):
            raise Exception("Unsupported option format: '{:s}'".format(arg))
    # Positional arguments: the first is the input, the second the output.
    if not FINAME:
        FINAME = arg
    elif not FONAME:
        FONAME = arg
    else:
        raise Exception("Superfluous argument: '{:s}'".format(arg))


# Fail with a clear message when no input file was given; otherwise
# io.open(None) below would raise an unrelated looking TypeError.
if FINAME is None:
    raise Exception("Input file is not specified")

PARSER = gadict.Parser()
# The context manager closes the input both on success and on failure.
with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as FIN:
    try:
        DOM = PARSER.parse(FIN)
    except gadict.ParseException as ex:
        # Report the file name together with the parser's error and stop.
        sys.stdout.write(u"{:s}{:s}\n".format(FINAME, repr(ex)))
        if __debug__:
            import traceback
            traceback.print_exc()
        # sys.exit() instead of exit(): the latter is injected by the
        # "site" module and is not available in every run mode.
        sys.exit(1)

# The first DOM element is the dictionary prelude; articles follow it.
PRELUDE = DOM[0]


# Write to standard output unless an output file name was supplied.
FOUT = sys.stdout if FONAME is None else codecs.open(FONAME, "w", "utf-8")

# Page skeleton.  HTML_HEADER is a str.format() template with a single
# "{title}" field, which is why literal CSS braces are doubled.
HTML_HEADER = """<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title>{title}</title>
<style>
body {{ max-width: 30em; margin: 0 auto; }}
.article {{
  margin: .5em 0;
  border-bottom: 1px dotted brown;
}}
.sense {{ margin-top: .5em; }}
.head {{ color: black; font-weight: bold; }}
.pron {{ color: green; }}
.attr {{ color: magenta; }}
.pos {{ color: green; font-weight: bold; }}
.lang {{ color: brown; font-weight: bold; }}
.ex {{ font-style: italic; }}
.freq {{ color: red; font-family: monospace; }}
</style>
</head>
<body>
"""
HTML_FOOTER = """</body>
</html>
"""

# Page header.  The dictionary name comes from the parsed prelude, so it
# is escaped for <title> in the same way as for the <h1> heading.
FOUT.write(HTML_HEADER.format(title=escape(PRELUDE.name)))
FOUT.write(u"<h1>{} dictionary</h1>\n".format(escape(PRELUDE.name)))
# FOUT.write("<a href='{}'>Home page</a>")
# FOUT.write(" , ".join(PRELUDE.urls))
FOUT.write("<p>License: ")
FOUT.write(escape(", ".join(PRELUDE.licences)))
# FOUT.write("</p>\n<p>")
# FOUT.write(escape(PRELUDE.about))
FOUT.write("</p>\n")

def link(lst):
    """Return in-page links to the given words, joined with "; ".

    Each word becomes ``<a href='#WORD'>WORD</a>``.  The link text gets the
    usual "&", "<", ">" escaping; the ``href`` value additionally has both
    quote characters escaped, because it sits inside a single-quoted
    attribute and a word such as "don't" would otherwise end it early.

    :param lst: iterable of words (strings) to link to.
    :return: the joined links, or an empty string for an empty iterable.
    """
    buf = []
    for el in lst:
        text = escape(el)
        target = escape(el, {"'": "&#39;", '"': "&quot;"})
        buf.append(u"<a href='#{}'>{}</a>".format(target, text))
    return "; ".join(buf)

# Extra entities for text placed inside a quoted HTML attribute value;
# plain escape() only handles "&", "<" and ">".
_ATTR_ENTITIES = {"'": "&#39;", '"': "&quot;"}


def _write_links(name, words):
    """Write `` NAME: <span class='NAME'>LINKS</span>`` for *words*.

    Nothing is written when *words* is empty or None.
    """
    if not words:
        return
    FOUT.write(u" {}: <span class='{}'>".format(name, name))
    FOUT.write(link(words))
    FOUT.write("</span>")


def _write_lang_entries(entries, css_class):
    """Write one ``<div>`` per (lang, text) pair that passes the LANGS filter.

    :param entries: iterable of (lang, text) pairs, or None.
    :param css_class: class put on the text span and, next to "lang", on
        the language label span.
    """
    for (lang, text) in entries or []:
        if LANGS and lang not in LANGS:
            continue
        FOUT.write("<div>")
        # The language label is only written when more than one language
        # can appear in the output.
        if not LANGS or len(LANGS) > 1:
            FOUT.write(u"<span class='lang {}'>{}</span> ".format(css_class, escape(lang)))
        FOUT.write(u"<span class='{}'>{}</span>".format(css_class, escape(text)))
        FOUT.write("</div>")


for (headwords, translations) in DOM[1:]:
    # The first headword names the article and is used as its anchor id.
    identity = headwords[0].headword
    # Quotes are escaped too: the id sits inside a single-quoted attribute.
    FOUT.write(u"<div id='{}' class='article'>\n".format(escape(identity, _ATTR_ENTITIES)))
    for hw in headwords:
        FOUT.write("<div>")
        FOUT.write(u"<span class='head'>{}</span>".format(escape(hw.headword)))
        if hw.pron is not None:
            FOUT.write(" <span class='pron'>[")
            FOUT.write(escape(hw.pron))
            FOUT.write("]</span>")
        if hw.attrs:
            FOUT.write(" <span class='attr'>")
            # Sort the already quoted strings, then escape the joined text;
            # escape() leaves the guillemets and separators untouched.
            quoted = sorted(u"«" + x + u"»" for x in hw.attrs)
            FOUT.write(escape(", ".join(quoted)))
            FOUT.write("</span>")
        _write_links("homo", hw.homo)
        FOUT.write("</div>\n")
    for sense in translations:
        if not sense:
            # Name the article by its first headword string.
            raise Exception("""Empty sense for article: """ + identity)
        FOUT.write("<div class='sense'>")
        if sense.pos:
            FOUT.write(u"<span class='pos'>«")
            FOUT.write(escape(sense.pos))
            FOUT.write(u"»</span> ")
            # NOTE(review): topics and word relations are only written for
            # senses that have a part of speech -- confirm this nesting is
            # intended.
            if sense.topic_list:
                FOUT.write("<span class='topic'>")
                FOUT.write(escape(", ".join(sense.topic_list)))
                FOUT.write("</span>")
            _write_links("ant", sense.ant_list)
            _write_links("syn", sense.syn_list)
            _write_links("hyper", sense.hyper_list)
            _write_links("hypo", sense.hypo_list)
            _write_links("see", sense.rel_list)
        _write_lang_entries(sense.tr_list, "tr")
        _write_lang_entries(sense.glos_list, "glos")
        _write_lang_entries(sense.ex_list, "ex")
        FOUT.write("</div>")
    # Tags of every frequency source whose word set contains the article.
    freqtags = [freqtag for (freqtag, freqset) in FREQ_SOURCES if identity in freqset]
    if freqtags:
        FOUT.write("<p class='freq'>")
        FOUT.write(escape(",".join(freqtags)))
        FOUT.write("</p>")
    FOUT.write("</div>")

FOUT.write(HTML_FOOTER)
# Close only a stream this script opened itself; when the page went to
# standard output it is flushed and left open.
if FOUT is sys.stdout:
    FOUT.flush()
else:
    FOUT.close()