py/gadict_spellcheck.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Mon, 27 Feb 2023 00:55:27 +0200
changeset 1342 d6413e1d20b0
parent 694 4457721a1a13
permissions -rw-r--r--
Added new articles.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
660
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     1
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     2
import sys
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     3
import io
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     4
import regex
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     5
import enchant
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     6
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     7
################################################################
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     8
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     9
class EofReached(Exception):
    """Raised when the input stream has no more lines to read."""
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    11
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    12
class GadictSpellChecker:
    """Spell-check the headword lines of a gadict source stream.

    The stream is scanned for separator lines consisting of exactly ``__``.
    Each separator must be followed by an empty line; the lines after that,
    up to the next empty line, are treated as headwords.  Lines in that
    block which start with a space are skipped (presumably headword
    attributes, going by the pattern name).  The remaining lines are split
    on runs of spaces and commas and every token is checked against the
    loaded dictionaries.  A token accepted by at least one dictionary is
    considered correct; otherwise a ``file:line: "word" is misspelled``
    message is printed.

    If no dictionary could be loaded, every token is reported as misspelled.
    """

    # Article separator: a line consisting of exactly "__".
    SEPARATOR_RE = regex.compile(u"^__$")
    # Empty line; "$" also matches just before the trailing newline.
    EMPTY_RE = regex.compile(u"^$")
    # Lines of the headword block that start with a space are not checked.
    HEADWORD_ATTR_RE = regex.compile(u"^ ")
    # Tokens on a headword line are separated by spaces and/or commas.
    WORD_SPLIT_RE = regex.compile(u"[ ,]+")

    def __init__(self, stream, fname):
        """Prepare a checker reading lines from *stream*.

        *stream* must provide ``readline()``; *fname* is used only as the
        prefix of the printed diagnostics.
        """
        self.stream = stream
        self.fname = fname
        self.lineno = 0  # number of the most recently read line (1-based)
        self.dicts = []
        self._add_checker('en_US')
        self._add_checker('en_GB')
        # self._add_checker('en_AU')
        # self._add_checker('en_CA')

    def _add_checker(self, lang):
        """Load the enchant dictionary for *lang*; only warn if it is missing."""
        try:
            self.dicts.append(enchant.Dict(lang))
        except enchant.errors.DictNotFoundError:
            print("Dictionary '{:s}' is not found...".format(lang))

    def _readline(self):
        """Return the next line (newline included) and bump ``self.lineno``.

        Raises EofReached when the stream is exhausted, i.e. ``readline()``
        returns an empty string.
        """
        line = self.stream.readline()
        if not line:
            raise EofReached
        self.lineno += 1
        return line

    def _check_headwords(self):
        """Check headword lines up to (and including) the next empty line."""
        while True:
            line = self._readline()
            if self.EMPTY_RE.match(line):
                return
            if self.HEADWORD_ATTR_RE.match(line):
                continue
            for word in self.WORD_SPLIT_RE.split(line.strip()):
                # A leading/trailing comma or a whitespace-only line yields
                # empty tokens; enchant refuses to check an empty string.
                if not word:
                    continue
                if any(dic.check(word) for dic in self.dicts):
                    continue
                print("""{:s}:{:d}: "{:s}" is misspelled""".format(self.fname, self.lineno, word))

    def _check(self):
        """Scan the whole stream; terminates only via EofReached.

        Raises Exception when a separator line is not followed by an empty
        line.
        """
        while True:
            # Everything outside a headword block is ignored.
            if not self.SEPARATOR_RE.match(self._readline()):
                continue

            line = self._readline()
            if not self.EMPTY_RE.match(line):
                raise Exception("Line {:d}: '{:s}' is not empty line\n".format(self.lineno, line))

            self._check_headwords()

    def check(self):
        """Spell-check the stream until end of input is reached."""
        try:
            self._check()
        except EofReached:
            # Normal termination: the scanning loop ends only at end of input.
            pass
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    72
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    73
################################################################
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    74
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    75
def main(argv):
    """Spell-check the single file named on the command line.

    *argv* is the full argument vector (program name first).  Raises
    Exception when the number of arguments is not exactly one.
    """
    if len(argv) < 2:
        raise Exception("Please, supply path to file...")
    if len(argv) > 2:
        raise Exception("Only one argument is necessary...")

    finame = argv[1]
    with io.open(finame, mode='r', buffering=1, encoding="utf-8") as fin:
        checker = GadictSpellChecker(fin, finame)
        checker.check()


# Run only when executed as a script, so importing the module has no side
# effects (no argument parsing, no file I/O).
if __name__ == "__main__":
    main(sys.argv)
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    84