Added new articles.
import sys
import codecs
import io
import re
# Command line: INPUT [OUTPUT].
# FINAME is the path of the UTF-8 input file; FONAME is the optional path of
# the UTF-8 output file.  When FONAME is not given, output goes to stdout.
FINAME = None
FONAME = None
if len(sys.argv) >= 2:
    FINAME = sys.argv[1]
if len(sys.argv) >= 3:
    FONAME = sys.argv[2]

if FINAME is None:
    # Without an input path io.open(None, ...) would fail with an opaque
    # TypeError; tell the user what is expected instead.
    sys.stderr.write("usage: {} INPUT [OUTPUT]\n".format(
        sys.argv[0] if sys.argv else "PROG"))
    sys.exit(2)

# buffering=1 selects line buffering for the text-mode input stream.
FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")
if FONAME is None:
    FOUT = sys.stdout
else:
    FOUT = codecs.open(FONAME, "w", "utf-8")
class GadictParser:
    """Collect the headwords found in a line-oriented text stream.

    The stream is scanned for separator lines consisting of exactly ``__``.
    Each separator must be followed by an empty line, after which come the
    headwords, one per line.  Inside that run, lines that start with a space
    are skipped (presumably headword attributes, judging by the pattern
    name), and the run ends at the first blank line.  Everything between the
    end of a run and the next separator is ignored.

    ``stream`` only needs a ``readline()`` method returning text.
    ``lineno`` is the number of lines read so far, i.e. the 1-based number
    of the most recently read line; it is used in error messages.
    """

    # A line made of exactly two underscores.
    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
    # A line with no characters before its line terminator.
    EMPTY_RE = re.compile( u"^$" )
    # A line whose first character is a space.
    HEADWORD_ATTR_RE = re.compile( u"^ " )
    # A (stripped) line whose first character is a word character.
    HEADWORD_RE = re.compile(u"^(\\w.*)$", re.UNICODE)

    def __init__(self, stream):
        self.stream = stream
        self.lineno = 0

    def parse(self):
        """Return the list of all headwords in the stream, in input order.

        Headwords are returned with surrounding whitespace stripped.

        Raises:
            Exception: if a separator is not followed by an empty line, or
                if a line in a headword run does not start with a word
                character.  The message starts with ``LINENO: `` so that a
                caller can prepend the file name to get ``file:line: ...``.
        """
        wlist = []
        while self._skip_to_separator():
            line = self._read_line()
            if line is None:
                break
            if not self.EMPTY_RE.match(line):
                # Quote the line without its terminator so the message is
                # not broken in the middle of the quotes.
                raise Exception("{:d}: '{:s}' is not empty line\n".format(
                    self.lineno, line.rstrip("\r\n")))
            if not self._read_headwords(wlist):
                break
        return wlist

    def _read_line(self):
        """Read the next line and count it; return None at end of input."""
        line = self.stream.readline()
        if not line:
            return None
        self.lineno += 1
        return line

    def _skip_to_separator(self):
        """Consume lines up to and including the next separator.

        Returns True when a separator was found, False at end of input.
        """
        while True:
            line = self._read_line()
            if line is None:
                return False
            if self.SEPARATOR_RE.match(line):
                return True

    def _read_headwords(self, wlist):
        """Append one run of headwords to *wlist*.

        Returns True when the run was ended by a blank line, False when the
        input ended first.
        """
        while True:
            line = self._read_line()
            if line is None:
                return False
            if self.HEADWORD_ATTR_RE.match(line):
                # Lines starting with a space never end the run, even when
                # they contain nothing but whitespace.
                continue
            line = line.strip()
            if not line:
                return True
            if not self.HEADWORD_RE.match(line):
                raise Exception("{:d}: '{:s}' is not a headword\n".format(
                    self.lineno, line))
            wlist.append(line)
# Driver: parse the input stream and write each headword on its own line.
try:
    parser = GadictParser(FIN)
    for headword in parser.parse():
        FOUT.write(headword)
        FOUT.write("\n")
except Exception as ex:
    # Prefix the diagnostic with the input file name.  It goes to stderr so
    # that it is never mixed into the headword list, which is written to
    # stdout when no output file was given.
    sys.stderr.write("{}:{}\n".format(FINAME, str(ex)))
    # Bare raise keeps the original traceback intact.
    raise
finally:
    FIN.close()
    if FOUT is sys.stdout:
        # stdout is not ours to close; just make sure the output is flushed.
        FOUT.flush()
    else:
        FOUT.close()