py/gadict_headwords.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Thu, 26 Jan 2017 22:40:25 +0200
changeset 742 5a5be84a113d
parent 720 b5a4b476eddf
child 757 5417f2102dc5
permissions -rw-r--r--
If `gadict-espeak-enabled' is `t', the pronunciation will be filled in by espeak using `gadict-espeak-default-voice'.


import sys
import codecs
import io
import regex

# Command line: <script> INPUT [OUTPUT]
#   INPUT  - path of the UTF-8 source file to read (required).
#   OUTPUT - path of the UTF-8 file to write; standard output when omitted.
FINAME = sys.argv[1] if len(sys.argv) >= 2 else None
FONAME = sys.argv[2] if len(sys.argv) >= 3 else None

if FINAME is None:
    # Without this check io.open(None, ...) fails with an opaque TypeError.
    sys.stderr.write("usage: {} INPUT [OUTPUT]\n".format(
        sys.argv[0] if sys.argv else "gadict_headwords.py"))
    sys.exit(2)

# Line-buffered text stream; the parser consumes it one line at a time.
FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")
if FONAME is None:
    FOUT = sys.stdout
else:
    try:
        FOUT = codecs.open(FONAME, "w", "utf-8")
    except Exception:
        # Do not leave the input stream open when the output cannot be created.
        FIN.close()
        raise


class GadictParser:
    """Collect headwords from a line-oriented text stream.

    A section starts with a line that is exactly ``__`` followed by an empty
    line.  After that, every line up to the next blank line is either:

    * a line starting with a space, which is skipped, or
    * a headword, which must start with a Unicode letter.

    Lines outside such sections are ignored.
    """

    # Line that consists of exactly two underscores; opens a section.
    SEPARATOR_RE = regex.compile(u"^__$")
    # Empty line that must directly follow the separator.
    EMPTY_RE = regex.compile(u"^$")
    # Line starting with a space inside a headword section; it is skipped.
    HEADWORD_ATTR_RE = regex.compile(u"^ ")
    # A headword (already stripped) has to begin with a Unicode letter.
    HEADWORD_RE = regex.compile(u"^(\\p{L}.*)$")

    def __init__(self, stream):
        """Remember *stream*, an object with a ``readline()`` method.

        ``readline()`` is expected to return an empty string at end of input.
        """
        self.stream = stream
        # Number of lines read so far; used as the line number in errors.
        self.lineno = 0

    def _next_line(self):
        """Read one line, counting it in ``lineno``; return '' at end of input."""
        line = self.stream.readline()
        if line:
            self.lineno += 1
        return line

    def parse(self):
        """Read the whole stream and return the list of headwords.

        Headwords are returned stripped of surrounding whitespace, in the
        order they appear in the stream.

        Raises:
            Exception: when the line after a separator is not empty, or when a
                line in a headword section is not a valid headword.  The
                message starts with ``<lineno>:`` so that a caller prefixing
                the file name gets the usual ``file:line: message`` form.
        """
        wlist = []
        while True:
            # Skip everything up to the next separator line.
            line = self._next_line()
            if not line:
                return wlist
            if not self.SEPARATOR_RE.match(line):
                continue

            line = self._next_line()
            if not line:
                return wlist
            if not self.EMPTY_RE.match(line):
                # Drop the line terminator so the quoted text stays on one line.
                raise Exception("{:d}: '{:s}' is not empty line\n".format(
                    self.lineno, line.rstrip("\r\n")))

            # Headword section: runs until a blank line or end of input.
            while True:
                line = self._next_line()
                if not line:
                    return wlist
                if self.HEADWORD_ATTR_RE.match(line):
                    continue
                line = line.strip()
                if not line:
                    break
                if not self.HEADWORD_RE.match(line):
                    raise Exception("{:d}: '{:s}' is not a headword\n".format(
                        self.lineno, line))
                wlist.append(line)

# Parse the input and write one headword per line to the output stream.
try:
    parser = GadictParser(FIN)
    for headword in parser.parse():
        FOUT.write(headword)
        FOUT.write("\n")
except Exception as ex:
    # Diagnostics go to stderr: stdout may be the data stream (no OUTPUT
    # argument), and error text must not end up among the headwords.
    sys.stderr.write("{}:{}\n".format(FINAME, str(ex)))
    # Bare raise re-raises the active exception with its original traceback.
    raise
finally:
    try:
        FIN.close()
    finally:
        # Runs even if closing the input failed.  Standard output is only
        # flushed, never closed: it does not belong to this script.
        if FOUT is sys.stdout:
            FOUT.flush()
        else:
            FOUT.close()