# -*- coding: utf-8 -*-
"""HTML format writer.

Usage::

    gadict_html.py [-lang:LL[,LL...]] [-freq:(freq|var):TAG=FILE]... [--] INPUT [OUTPUT]

``-lang:`` restricts translations, glosses and examples to the listed
two-letter language codes.  Each ``-freq:`` option loads a word list from
FILE (parsed by ``gadict_freq.FreqlistParser`` for ``freq`` or
``gadict_freq.WordformParser`` for ``var``); articles whose first headword
occurs in the list are marked with TAG.  Everything after ``--`` is treated
as a file name.  When OUTPUT is omitted the document goes to standard output.

NOTE(review): the HTML markup inside the string literals was lost from the
copy of this file that was reviewed.  The nesting below follows the
surviving write calls, but element and class names are a reconstruction --
confirm them against the project's stylesheet before relying on them.
"""

import io
import sys
import codecs
import re
import html

import gadict
import gadict_freq


FINAME = None
FONAME = None
LANGS = None
FREQ_SOURCES = []

# -lang:ru,uk
ARG_LANG_RE = re.compile("-lang:(.+)")
# -freq:var:TAG=FILE or -freq:freq:TAG=FILE
ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)")

# Word-list parser class for every mode accepted by ARG_FREQ_RE.
FREQ_PARSERS = {
    "var": gadict_freq.WordformParser,
    "freq": gadict_freq.FreqlistParser,
}

look_for_files = False
for arg in sys.argv[1:]:
    if arg == "--":
        # Everything that follows is a file name, even if it starts with "-".
        look_for_files = True
        continue
    if not look_for_files:
        m = ARG_LANG_RE.match(arg)
        if m:
            LANGS = set(m.group(1).split(","))
            for lang in LANGS:
                if len(lang) != 2:
                    raise Exception("Incorrect language specification: '{:s}'".format(arg))
            continue
        m = ARG_FREQ_RE.match(arg)
        if m:
            mode, tag, fname = m.groups()
            parser_cls = FREQ_PARSERS.get(mode)
            if parser_cls is None:
                raise Exception("Unsupported mode: '{:s}'".format(mode))
            with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
                # Materialise the set while the stream is still open in case
                # the parser reads lazily.
                words = set(parser_cls(stream).parse())
            FREQ_SOURCES.append((tag, words))
            continue
        if arg.startswith("-"):
            raise Exception("Unsupported option format: '{:s}'".format(arg))
    if not FINAME:
        FINAME = arg
        continue
    if not FONAME:
        FONAME = arg
        continue
    raise Exception("Unnecessary argument: '{:s}'".format(arg))

if not FINAME:
    # Fail with a clear message instead of the TypeError io.open(None) gives.
    raise Exception("Input file is not specified")


PARSER = gadict.Parser()
with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as FIN:
    try:
        DOM = PARSER.parse(FIN)
    except gadict.ParseException as ex:
        sys.stdout.write("{:s}{:s}\n".format(FINAME, repr(ex)))
        if __debug__:
            import traceback
            traceback.print_exc()
        # sys.exit is always available; the bare exit() builtin is installed
        # by the site module only.
        sys.exit(1)

# The first DOM item describes the dictionary itself; the remaining items are
# (headwords, translations) pairs, one per article.
PRELUDE = DOM[0]


HTML_HEADER = """<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <title>{title}</title>
</head>
<body>
"""
HTML_FOOTER = """
</body>
</html>
"""


def link(lst):
    """Return the words of *lst* as in-page links joined by "; ".

    Each link targets the ``id`` attribute that the article writer puts on
    the article of the same name.
    """
    buf = []
    for el in lst:
        el = html.escape(el)
        buf.append("<a href='#{}'>{}</a>".format(el, el))
    return "; ".join(buf)


def _write_prelude(out, prelude):
    """Write the document header, dictionary title and licence line to *out*."""
    out.write(HTML_HEADER.format(title=html.escape(prelude.name)))
    out.write("<h1>{} dictionary</h1>\n".format(html.escape(prelude.name)))
    # TODO: the original had commented-out output for the home page URLs and
    # the "about" text of the prelude; restore it once its markup is known.
    out.write("<p>License: ")
    out.write(html.escape(", ".join(prelude.licences)))
    out.write("</p>\n")


def _write_lang_items(out, css_class, items):
    """Write (lang, text) pairs, skipping languages excluded by LANGS.

    The language code is printed only when more than one language can
    appear, i.e. when no filter is set or the filter lists several codes.
    """
    show_lang = not LANGS or len(LANGS) > 1
    for (lang, text) in items or []:
        if LANGS and lang not in LANGS:
            continue
        out.write("<div class='{}'>".format(css_class))
        if show_lang:
            out.write("<span class='lang'>{}</span> ".format(html.escape(lang)))
        out.write("<span class='text'>{}</span>".format(html.escape(text)))
        out.write("</div>")


def _write_headword(out, hw):
    """Write one headword line: the word, its pronunciation and attributes."""
    out.write("<div>")
    out.write("<span class='headword'>{}</span>".format(html.escape(hw.headword)))
    if hw.pron is not None:
        out.write(" <span class='pron'>[")
        out.write(html.escape(hw.pron))
        out.write("]</span>")
    if len(hw.attrs) > 0:
        out.write(" <span class='attrs'>")
        # Attributes are escaped like every other piece of dictionary text.
        out.write(", ".join("«" + html.escape(x) + "»" for x in sorted(hw.attrs)))
        out.write("</span>")
    out.write("</div>\n")


def _write_sense(out, sense):
    """Write one sense: part of speech, topics, related words, translations."""
    out.write("<div class='sense'>")
    if sense.pos:
        out.write("<span class='pos'>«")
        out.write(html.escape(sense.pos))
        out.write("»</span> ")
    if sense.topic_list:
        out.write("<span class='topic'>")
        out.write(html.escape(", ".join(sense.topic_list)))
        out.write("</span>")
    relations = (
        ("ant", sense.ant_list),
        ("syn", sense.syn_list),
        ("hyper", sense.hyper_list),
        ("hypo", sense.hypo_list),
        ("see", sense.rel_list),
    )
    for (label, words) in relations:
        if words:
            out.write(" <span class='rel'>{}: ".format(label))
            out.write(link(words))
            out.write("</span>")
    _write_lang_items(out, "tr", sense.tr_list)
    _write_lang_items(out, "glos", sense.glos_list)
    _write_lang_items(out, "ex", sense.ex_list)
    out.write("</div>")


def _write_article(out, headwords, translations):
    """Write one article; its first headword becomes the link target id.

    Raises Exception when one of the senses is empty.
    """
    identity = headwords[0].headword
    out.write("<div id='{}' class='article'>\n".format(html.escape(identity)))
    for hw in headwords:
        _write_headword(out, hw)
    for sense in translations:
        if not sense:
            # Report the article by its headword text; concatenating the
            # headword object itself would raise TypeError instead.
            raise Exception("Empty sense for article: " + identity)
        _write_sense(out, sense)
    freqtags = [tag for (tag, words) in FREQ_SOURCES if identity in words]
    if len(freqtags) > 0:
        out.write("<p class='freq'>")
        out.write(html.escape(",".join(freqtags)))
        out.write("</p>")
    out.write("</div>")


if FONAME is None:
    FOUT = sys.stdout
else:
    FOUT = codecs.open(FONAME, "w", "utf-8")

try:
    _write_prelude(FOUT, PRELUDE)
    for (headwords, translations) in DOM[1:]:
        _write_article(FOUT, headwords, translations)
    FOUT.write(HTML_FOOTER)
finally:
    # Close only a file this script opened; standard output stays usable.
    if FOUT is not sys.stdout:
        FOUT.close()