py/gadict_spellcheck.py
author Oleksandr Gavenko <gavenkoa@gmail.com>
Thu, 10 Nov 2016 16:27:42 +0200
changeset 660 5305f170237d
child 662 a0ef60715efe
permissions -rw-r--r--
Add spell checker based on hunspell.
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
660
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     1
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     2
import sys
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     3
import io
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     4
import regex
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     5
import enchant
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     6
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     7
################################################################
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     8
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
     9
class EofReached (Exception):
    """Signal that the input stream has no more lines to read."""
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    11
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    12
class EmptyChecker:
    """Stand-in checker that does not recognise any word.

    It offers the same ``check(word)`` entry point as a real dictionary
    object, so it can take a dictionary's place in an ``or`` chain of
    lookups without affecting the result.
    """

    def check(self, word):
        """Return False for every ``word``: this checker knows no words."""
        return False
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    17
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    18
class GadictSpellChecker:
    """Spell check the headword lines of a dictionary source stream.

    The stream is scanned for separator lines (a line consisting of ``__``).
    The line right after a separator must be empty.  The lines that follow,
    up to the next empty line, are processed one by one: a line starting
    with a space is skipped (presumably a headword attribute, going by the
    pattern name), any other line is split on spaces and commas and every
    resulting word is looked up in the en_US, en_GB, en_AU and en_CA
    dictionaries.  A word accepted by none of them is reported on standard
    output as ``<fname>:<lineno>: "<word>" is misspelled``.
    """

    SEPARATOR_RE = regex.compile(u"^__$")
    EMPTY_RE = regex.compile(u"^$")
    HEADWORD_ATTR_RE = regex.compile(u"^ ")
    # Delimiters between the words of a single line; compiled once instead
    # of on every processed line.
    WORD_SPLIT_RE = regex.compile(u"[ ,]+")

    def _get_checker(self, lang):
        """Return a checker for ``lang``.

        When enchant has no dictionary for ``lang`` a notice is printed and
        an ``EmptyChecker`` is returned instead, so the remaining
        dictionaries can still be consulted.
        """
        try:
            return enchant.Dict(lang)
        except enchant.errors.DictNotFoundError:
            print("Dictionary '{:s}' is not found...".format(lang))
            return EmptyChecker()

    def __init__(self, stream, fname):
        """Prepare a checker for ``stream``.

        stream -- object with a ``readline()`` method returning text lines.
        fname  -- name printed in front of every reported misspelling.
        """
        self.stream = stream
        self.fname = fname
        self.lineno = 0     # number of lines read so far
        self.dict_us = self._get_checker('en_US')
        self.dict_gb = self._get_checker('en_GB')
        self.dict_au = self._get_checker('en_AU')
        self.dict_ca = self._get_checker('en_CA')

    def _readline(self):
        """Return the next line and advance ``lineno``.

        Raises ``EofReached`` when the stream returns an empty string.
        """
        line = self.stream.readline()
        if not line:
            raise EofReached
        self.lineno += 1
        return line

    def _is_known(self, word):
        """Return True when at least one dictionary accepts ``word``."""
        return bool(self.dict_us.check(word)
                    or self.dict_gb.check(word)
                    or self.dict_au.check(word)
                    or self.dict_ca.check(word))

    def _skip_to_separator(self):
        """Consume lines up to and including the next separator line."""
        while not self.SEPARATOR_RE.match(self._readline()):
            pass

    def _expect_empty_line(self):
        """Consume one line and fail unless it is empty."""
        line = self._readline()
        if not self.EMPTY_RE.match(line):
            # Drop the line terminator so the quoted text stays on one line.
            raise Exception("Line {:d}: '{:s}' is not empty line\n".format(
                self.lineno, line.rstrip(u"\r\n")))

    def _check_line(self, line):
        """Report every word of ``line`` that no dictionary accepts."""
        for word in self.WORD_SPLIT_RE.split(line.strip()):
            # Splitting yields empty strings for a leading or trailing
            # delimiter and for whitespace-only lines; enchant refuses to
            # check an empty string, so such tokens are skipped.
            if not word or self._is_known(word):
                continue
            print("""{:s}:{:d}: "{:s}" is misspelled""".format(
                self.fname, self.lineno, word))

    def _check_lines_until_empty(self):
        """Check lines up to (and consuming) the next empty line."""
        while True:
            line = self._readline()
            if self.EMPTY_RE.match(line):
                return
            if self.HEADWORD_ATTR_RE.match(line):
                continue
            self._check_line(line)

    def _check_body(self):
        """Process the stream until ``_readline`` raises ``EofReached``.

        Raises ``Exception`` when the line after a separator is not empty.
        """
        while True:
            self._skip_to_separator()
            self._expect_empty_line()
            self._check_lines_until_empty()

    def check(self):
        """Spell check the whole stream; reaching its end is not an error."""
        try:
            self._check_body()
        except EofReached:
            pass
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    79
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    80
################################################################
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    81
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    82
# Command line entry point: exactly one argument is accepted, the path to
# the dictionary source that should be checked.
if len(sys.argv) < 2:
    raise Exception("Please, supply path to dictionary...")
if len(sys.argv) > 2:
    raise Exception("Only one argument necessary...")

FINAME = sys.argv[1]
# The source is opened as UTF-8 text; buffering=1 selects line buffering.
with io.open(FINAME, mode='r', buffering=1, encoding="utf-8") as FIN:
    checker = GadictSpellChecker(FIN, FINAME)
    checker.check()
5305f170237d Add spell checker based on hunspell.
Oleksandr Gavenko <gavenkoa@gmail.com>
parents:
diff changeset
    91