py/gadict_c5.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Tue, 20 Mar 2018 23:09:15 +0200
changeset 1027 7bbe42d02ee2
parent 984 73d6e2631338
child 1315 6532512bbac4
permissions -rw-r--r--
Added new articles.

# -*- coding: utf-8 -*-
"""dictd C5 format writer"""

import io
import sys
import codecs
import re

import gadict
import gadict_freq


# Settings filled in from the command line:
#   FINAME       - input file name (first positional argument)
#   FONAME       - output file name (second positional argument, optional)
#   LANGS        - set of language codes to keep, or None when not restricted
#   FREQ_SOURCES - list of (tag, set of words) pairs loaded from -freq: options
FINAME = None
FONAME = None
LANGS = None
FREQ_SOURCES = []

# Language filter option, e.g.: -lang:ru,uk
ARG_LANG_RE = re.compile("-lang:(.+)")
# Frequency source option: -freq:var:TAG=FILE or -freq:freq:TAG=FILE
ARG_FREQ_RE = re.compile("-freq:(freq|var):([^=]+)=(.+)")

# After a bare "--" every remaining argument is taken as a file name,
# even when it starts with a dash.
look_for_files = False
for arg in sys.argv[1:]:
    if arg == "--":
        look_for_files = True
        continue
    if not look_for_files:
        lang_match = ARG_LANG_RE.match(arg)
        if lang_match:
            LANGS = set(lang_match.group(1).split(","))
            # Every requested language must be a two-character code.
            if any(len(code) != 2 for code in LANGS):
                raise Exception("Incorrect language specification: '{:s}'".format(arg))
            continue
        freq_match = ARG_FREQ_RE.match(arg)
        if freq_match:
            mode, tag, fname = freq_match.groups()
            with io.open(fname, mode='r', buffering=1, encoding="utf-8") as stream:
                if mode == "freq":
                    parser = gadict_freq.FreqlistParser(stream)
                elif mode == "var":
                    parser = gadict_freq.WordformParser(stream)
                else:
                    raise Exception("Unsupported mode: '{:s}'".format(mode))
                wlist = parser.parse()
            FREQ_SOURCES.append((tag, set(wlist)))
            continue
        if arg.startswith("-"):
            raise Exception("Unsupported option format: '{:s}'".format(arg))
    # Positional arguments: the input name first, then the output name.
    if not FINAME:
        FINAME = arg
    elif not FONAME:
        FONAME = arg
    else:
        raise Exception("Unnecessary argument: '{:s}'".format(arg))


# Parse the input dictionary into DOM; a parse error is reported and the
# script terminates with exit status 1.

# Fail with a clear message instead of an obscure TypeError from io.open(None).
if FINAME is None:
    raise Exception("Input file is not specified")

PARSER = gadict.Parser()
try:
    # The context manager closes the input whether parsing succeeds or fails.
    with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as FIN:
        DOM = PARSER.parse(FIN)
except gadict.ParseException as ex:
    sys.stdout.write("{:s}{:s}\n".format(FINAME, repr(ex)))
    if __debug__:
        import traceback
        traceback.print_exc()
    # sys.exit() is always available; the bare exit() helper is installed by
    # the site module and is missing e.g. when Python runs with -S.
    sys.exit(1)


# The first DOM element is the prelude: dictionary name, URLs, licences, about.
PRELUDE = DOM[0]

# Sidecar files are named after the output file, so there is nothing to write
# when the dictionary goes to stdout (FONAME is None).  They are written as
# UTF-8 explicitly, like the main output, rather than in the locale encoding.
if FONAME is not None:
    with io.open(FONAME + ".name", mode="w", encoding="utf-8") as f:   # for dictfmt -s
        if PRELUDE.name is not None:
            f.write(PRELUDE.name)
        f.write("\n")

    with io.open(FONAME + ".url", mode="w", encoding="utf-8") as f:    # for dictfmt -u
        if PRELUDE.urls:
            f.write(PRELUDE.urls[0])
        f.write("\n")


# Output stream: stdout when no output name was given, else a UTF-8 file.
FOUT = sys.stdout if FONAME is None else codecs.open(FONAME, "w", "utf-8")

# Header of the generated file, taken from the dictionary prelude.
if PRELUDE.name is not None:
    FOUT.write("Dictionary name: " + PRELUDE.name + "\n\n")
FOUT.write("Project URLs: " + " , ".join(PRELUDE.urls) + "\n\n")
FOUT.write("Project licenses: " + ", ".join(PRELUDE.licences) + "\n\n")
FOUT.write(PRELUDE.about + "\n")


# Write every article (all DOM elements after the prelude) to FOUT.

# (label, sense attribute, item separator) for each cross-reference list of a
# sense, in output order.  Non-empty lists are separated by " |".
_SENSE_RELATIONS = (
    ("topic", "topic_list", ", "),
    ("ant", "ant_list", "; "),
    ("syn", "syn_list", "; "),
    ("hyper", "hyper_list", "; "),
    ("hypo", "hypo_list", "; "),
    ("col", "col_list", "; "),
    ("see", "rel_list", "; "),
)

# Language prefixes (and the line break after the part-of-speech line) are
# emitted unless exactly one language was selected with -lang:.
show_lang = not LANGS or len(LANGS) != 1

try:
    for (headwords, translations) in DOM[1:]:
        # The first headword identifies the article (used for frequency tags).
        identity = headwords[0].headword
        FOUT.write("_____\n\n")
        FOUT.write("; ".join([h.headword for h in headwords]))
        FOUT.write("\n\n")
        # One line per headword: spelling, pronunciation, attributes, homonyms.
        for hw in headwords:
            FOUT.write(hw.headword)
            if hw.pron is not None:
                FOUT.write(" [")
                FOUT.write(hw.pron)
                FOUT.write("]")
            if hw.attrs:
                FOUT.write(" ")
                FOUT.write(", ".join(sorted("«" + x + "»" for x in hw.attrs)))
            if hw.homo:
                FOUT.write(" homo: ")
                FOUT.write(", ".join(sorted("{" + x + "}" for x in hw.homo)))
            FOUT.write("\n")
        FOUT.write("\n")
        for sense in translations:
            if not sense:
                # Report the article by its headword text; concatenating the
                # headword object itself would fail with TypeError and hide
                # the real problem.
                raise Exception("Empty sense for article: " + identity)
            if sense.pos:
                FOUT.write("● «")
                FOUT.write(sense.pos)
                FOUT.write("»")
                relations = []
                for (label, attr, sep) in _SENSE_RELATIONS:
                    items = getattr(sense, attr)
                    if items:
                        relations.append(
                            " " + label + ": " + sep.join(["{" + s + "}" for s in items]))
                if relations:
                    FOUT.write(" |".join(relations))
                if show_lang:
                    FOUT.write("\n")
            # Translations: the arrow is printed only together with the language.
            for (lang, tr) in sense.tr_list or []:
                if LANGS and lang not in LANGS:
                    continue
                FOUT.write("  ")
                if show_lang:
                    FOUT.write(lang)
                    FOUT.write("→ ")
                FOUT.write(tr)
                FOUT.write("\n")
            # Glosses: the marker is always printed, the language only if shown.
            for (lang, tr) in sense.glos_list or []:
                if LANGS and lang not in LANGS:
                    continue
                FOUT.write("  ")
                if show_lang:
                    FOUT.write(lang)
                FOUT.write("↦ ")
                FOUT.write(tr)
                FOUT.write("\n")
            # Examples: indented deeper, the marker is always printed.
            for (lang, tr) in sense.ex_list or []:
                if LANGS and lang not in LANGS:
                    continue
                FOUT.write("    ")
                if show_lang:
                    FOUT.write(lang)
                FOUT.write("⇒ ")
                FOUT.write(tr)
                FOUT.write("\n")
        # Tags of every frequency source that contains the article headword.
        freqtags = [freqtag for (freqtag, freqset) in FREQ_SOURCES if identity in freqset]
        if freqtags:
            FOUT.write(",".join(["{{{:s}}}".format(tag) for tag in freqtags]))
            FOUT.write("\n")
finally:
    # Flush and release the output file; stdout is left open for the interpreter.
    if FOUT is not sys.stdout:
        FOUT.close()