Updated Emacs gaphrase to support the new format with unique ids.
An id is necessary to preserve learning progress in Anki when a deck is subsequently re-imported.
import sys
import codecs
import io
import re
# Command line: INPUT [OUTPUT].  INPUT is the UTF-8 source file to scan;
# OUTPUT, when given, receives the result, otherwise standard output is used.
FINAME = None
FONAME = None
if len(sys.argv) >= 2:
    FINAME = sys.argv[1]
if len(sys.argv) >= 3:
    FONAME = sys.argv[2]

# Without an input path io.open(None, ...) would fail with an opaque
# TypeError, so report the expected usage and stop instead.
if FINAME is None:
    sys.stderr.write("usage: {} INPUT [OUTPUT]\n".format(sys.argv[0]))
    sys.exit(2)

FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")
if FONAME is None:
    FOUT = sys.stdout
else:
    try:
        FOUT = codecs.open(FONAME, "w", "utf-8")
    except Exception:
        # Do not leave the input stream open when the output cannot be created.
        FIN.close()
        raise
class GadictParser:
    """Collect headwords from a line-oriented text stream.

    The parser recognizes articles laid out as::

        __
        <empty line>
        headword
         line starting with a space (skipped)
        another headword
        <empty line>
        ...anything up to the next "__" line is ignored...

    Attributes:
        stream: object with a ``readline()`` method returning text lines
            and an empty string at end of input.
        lineno: number of lines read so far (1-based number of the line
            most recently read).
    """

    # A line consisting solely of two underscores opens an article.
    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
    # The line right after the separator must be empty.
    EMPTY_RE = re.compile(u"^$")
    # Lines starting with a space inside the headword section are skipped.
    HEADWORD_ATTR_RE = re.compile(u"^ ")
    # A headword (after stripping) must start with a word character.
    HEADWORD_RE = re.compile(u"^(\\w.*)$", re.UNICODE)

    def __init__(self, stream):
        """Remember *stream* and reset the line counter."""
        self.stream = stream
        self.lineno = 0

    def _next_line(self):
        """Return the next raw line and count it, or None at end of input."""
        line = self.stream.readline()
        if not line:
            return None
        self.lineno += 1
        return line

    def _read_headwords(self, wlist):
        """Append the headwords of the current article to *wlist*.

        Returns True when the section was closed by a blank line and False
        when the input ended first.

        Raises:
            ValueError: a non-blank line does not look like a headword.
        """
        while True:
            line = self._next_line()
            if line is None:
                return False
            if self.HEADWORD_ATTR_RE.match(line):
                continue
            line = line.strip()
            if not line:
                return True
            if not self.HEADWORD_RE.match(line):
                raise ValueError(
                    "{:d}: '{:s}' is not a headword\n".format(self.lineno, line))
            wlist.append(line)

    def parse(self):
        """Read the whole stream and return the list of headwords found.

        Headwords are returned stripped, in the order they appear.

        Raises:
            ValueError: the line after a separator is not empty, or a line
                in a headword section is not a valid headword.  The message
                starts with the offending line number.
        """
        wlist = []
        while True:
            line = self._next_line()
            if line is None:
                return wlist
            if not self.SEPARATOR_RE.match(line):
                # Not inside a headword section: ignore until next separator.
                continue
            line = self._next_line()
            if line is None:
                return wlist
            if not self.EMPTY_RE.match(line):
                # Same "<lineno>: ..." prefix as the headword error, and the
                # quoted text is shown without its line terminator.
                raise ValueError(
                    "{:d}: '{:s}' is not empty line\n".format(
                        self.lineno, line.rstrip("\r\n")))
            if not self._read_headwords(wlist):
                return wlist
# Parse the input stream and write one headword per line to the output.
try:
    parser = GadictParser(FIN)
    for headword in parser.parse():
        FOUT.write(headword)
        FOUT.write("\n")
except Exception as ex:
    # Report on stderr: stdout may be the data output itself, and the
    # diagnostic must not end up mixed into the headword list.
    sys.stderr.write("{}:{}\n".format(FINAME, str(ex)))
    # Bare raise keeps the original traceback intact.
    raise
finally:
    FIN.close()
    if FOUT is sys.stdout:
        # The interpreter owns stdout; flush it rather than closing it.
        FOUT.flush()
    else:
        FOUT.close()