py/gadict_headwords.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Sat, 20 Apr 2019 17:41:33 +0300
changeset 1145 79b55cca9f44
parent 757 5417f2102dc5
permissions -rw-r--r--
Updated Emacs gaphrase to support the new format with unique ids. The id is necessary to preserve progress in Anki on subsequent imports of the deck.


import sys
import codecs
import io
import re

# Command line: SCRIPT INPUT [OUTPUT]
#   INPUT  -- file to parse, read as UTF-8 (required).
#   OUTPUT -- file to write, encoded as UTF-8 (optional); standard output
#             is used when it is omitted.
FINAME = None
FONAME = None
if len(sys.argv) >= 2:
    FINAME = sys.argv[1]
if len(sys.argv) >= 3:
    FONAME = sys.argv[2]

if FINAME is None:
    # Without this check io.open(None, ...) below fails with an opaque
    # TypeError; sys.exit() with a string prints it to stderr and exits
    # with status 1.
    sys.exit("Usage: {} INPUT [OUTPUT]".format(sys.argv[0]))

# Line-buffered text stream; the parser reads it with readline().
FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")
if FONAME is None:
    FOUT = sys.stdout
else:
    # codecs.open is kept on purpose: its writer accepts the plain "\n"
    # string literals written by the script body.
    FOUT = codecs.open(FONAME, "w", "utf-8")


class GadictParser:
    """Collect headwords from a line-oriented text stream.

    The layout recognised by :meth:`parse`:

    * a line consisting of ``__`` starts an entry;
    * the line right after it must be empty;
    * the following lines, up to the first blank line, are headwords;
      lines in that run which begin with a space are skipped;
    * everything else up to the next ``__`` line is ignored.
    """

    # Entry separator: a line that is exactly "__".
    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
    # Empty line; "$" also matches just before a trailing newline.
    EMPTY_RE = re.compile( u"^$" )
    # Line starting with a space inside the headword run (presumably
    # attributes of the preceding headword, judging by the name).
    HEADWORD_ATTR_RE = re.compile( u"^ " )
    # Headword: must start with a word character.
    HEADWORD_RE = re.compile(u"^(\\w.*)$", re.UNICODE)

    def __init__(self, stream):
        """Remember ``stream`` (any object with ``readline()``) and reset
        the line counter used in error messages."""
        self.stream = stream
        self.lineno = 0

    def _readline(self):
        """Return the next raw line and count it, or ``None`` at end of input."""
        line = self.stream.readline()
        if not line:
            return None
        self.lineno += 1
        return line

    def parse(self):
        """Read the stream to its end and return the list of headwords.

        Headwords are returned in input order with surrounding whitespace
        stripped.

        Raises:
            Exception: when the line after a ``__`` separator is not empty,
                or when a line in the headword run does not start with a
                word character.  The message starts with ``<lineno>:``.
        """
        wlist = []
        while True:
            # Skip forward to the next entry separator.
            line = self._readline()
            if line is None:
                return wlist
            if not self.SEPARATOR_RE.match(line):
                continue

            # The separator must be followed by an empty line.
            line = self._readline()
            if line is None:
                return wlist
            if not self.EMPTY_RE.match(line):
                # Drop the line terminator so it does not end up inside
                # the quotes and split the message in two.
                raise Exception("{:d}: '{:s}' is not empty line\n".format(self.lineno, line.rstrip("\r\n")))

            # Headword run: ends at the first blank line or at end of input.
            while True:
                line = self._readline()
                if line is None:
                    return wlist
                if self.HEADWORD_ATTR_RE.match(line):
                    continue
                line = line.strip()
                if not line:
                    break
                if not self.HEADWORD_RE.match(line):
                    raise Exception("{:d}: '{:s}' is not a headword\n".format(self.lineno, line))
                wlist.append(line)

# Script body: parse FIN and write one headword per line to FOUT.
try:
    parser = GadictParser(FIN)
    for headword in parser.parse():
        FOUT.write(headword)
        FOUT.write("\n")
except Exception as ex:
    # Report as "<input file>:<message>" on stderr: stdout may be the data
    # channel (when no output file was given) and must stay clean.
    sys.stderr.write("{}:{}\n".format(FINAME, str(ex)))
    # Bare raise re-raises the active exception with its original traceback.
    raise
finally:
    FIN.close()
    if FOUT is sys.stdout:
        # Not opened by this script, so only flush it.
        FOUT.flush()
    else:
        FOUT.close()