py/gadict_headwords.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Sun, 28 Nov 2021 12:03:15 +0200
changeset 1331 005b99b02d6c
parent 757 5417f2102dc5
permissions -rw-r--r--
Removed BOM marker.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
643
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     1
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     2
import sys
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     3
import codecs
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     4
import io
757
5417f2102dc5 Switch to built-in `re` Python module over `regex`.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents: 720
diff changeset
     5
import re
643
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     6
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     7
# Command line: gadict_headwords.py INPUT [OUTPUT]
#   INPUT  - path of the dictionary source, read as UTF-8.
#   OUTPUT - optional path of the result file, written as UTF-8;
#            standard output is used when it is omitted.
FINAME = None
FONAME = None
if len(sys.argv) >= 2:
    FINAME = sys.argv[1]
if len(sys.argv) >= 3:
    FONAME = sys.argv[2]

# Without an input path io.open(None, ...) would fail with an opaque
# TypeError; stop early with a usage message on stderr instead.
if FINAME is None:
    sys.exit("Usage: {} INPUT [OUTPUT]".format(sys.argv[0]))

FIN = io.open(FINAME, mode='r', buffering=1, encoding="utf-8")
if FONAME is None:
    FOUT = sys.stdout
else:
    try:
        FOUT = codecs.open(FONAME, "w", "utf-8")
    except Exception:
        # Do not leak the already opened input file when the output
        # file cannot be created.
        FIN.close()
        raise
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    19
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    20
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    21
class GadictParser:
    """Collect headwords from a line-oriented dictionary source stream.

    The parser scans for a separator line (``__``).  The line right after
    a separator must be empty.  The lines that follow form the headword
    block: lines starting with a space are skipped, every other non-blank
    line is recorded (stripped) as a headword, and the first blank line
    ends the block.  Everything up to the next separator is ignored.
    """

    SEPARATOR_RE = re.compile(u"^__$", re.UNICODE)
    EMPTY_RE = re.compile( u"^$" )
    HEADWORD_ATTR_RE = re.compile( u"^ " )
    HEADWORD_RE = re.compile(u"^(\\w.*)$", re.UNICODE)

    def __init__(self, stream):
        """Remember ``stream``; it only needs a ``readline()`` method.

        ``lineno`` holds the number of lines read so far and is used in
        error messages.
        """
        self.stream = stream
        self.lineno = 0

    def _readline(self):
        """Read one line, counting it; return an empty string at EOF."""
        line = self.stream.readline()
        if line:
            self.lineno += 1
        return line

    def parse(self):
        """Read the stream to its end and return the list of headwords.

        Returns:
            Headwords in the order they appear in the stream, with
            surrounding whitespace stripped.

        Raises:
            Exception: if the line after a separator is not empty, or a
                line inside a headword block is not a valid headword.
        """
        wlist = []
        while True:
            # Skip everything until the next entry separator.
            line = self._readline()
            if not line:
                return wlist
            if not self.SEPARATOR_RE.match(line):
                continue

            # The separator has to be followed by an empty line.
            line = self._readline()
            if not line:
                return wlist
            if not self.EMPTY_RE.match(line):
                # Drop the line terminator so the message stays on one line.
                raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, line.rstrip("\r\n")))

            # Headword block: runs until the first blank line.
            while True:
                line = self._readline()
                if not line:
                    return wlist
                if self.HEADWORD_ATTR_RE.match(line):
                    continue
                line = line.strip()
                if not line:
                    break
                # "\w" also matches "_", so HEADWORD_RE alone would accept a
                # separator that directly follows the headwords and the next
                # entry would be skipped silently; reject it explicitly.
                if self.SEPARATOR_RE.match(line) or not self.HEADWORD_RE.match(line):
                    raise Exception("{:d}: '{:s}' is not a headword\n".format(self.lineno, line))
                wlist.append(line)
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    67
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    68
# Parse the input and write every headword on its own line.
try:
    parser = GadictParser(FIN)
    for headword in parser.parse():
        FOUT.write(headword)
        FOUT.write("\n")
except Exception as ex:
    # Report on stderr: stdout may be the data stream (no output file
    # given) and must not be polluted with diagnostics.
    sys.stderr.write("{}:{}\n".format(FINAME, str(ex)))
    # Bare raise keeps the original traceback.
    raise
finally:
    FIN.close()
    # Only close what this script opened; sys.stdout belongs to the
    # interpreter.
    if FOUT is not sys.stdout:
        FOUT.close()
c2c32f45dde6 Search rare words in gadict.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    79