diff --git a/pospell.py b/pospell.py
index 8db7fdf..6185f3f 100644
--- a/pospell.py
+++ b/pospell.py
@@ -5,6 +5,7 @@
 import subprocess
 import sys
 import tempfile
+from unicodedata import category
 from contextlib import redirect_stderr
 from itertools import chain
 from pathlib import Path
@@ -16,21 +17,14 @@
 import polib
 from docutils.parsers.rst import roles
 from docutils.utils import new_document
-
+from hunspell import Hunspell
+from nltk.tokenize import TweetTokenizer
 import regex
 
 __version__ = "1.0.3"
 
 DEFAULT_DROP_CAPITALIZED = {"fr": True, "fr_FR": True}
 
-try:
-    HUNSPELL_VERSION = subprocess.check_output(
-        ["hunspell", "--version"], universal_newlines=True
-    ).split("\n")[0]
-except FileNotFoundError:
-    print("hunspell not found, please install hunspell.", file=sys.stderr)
-    exit(1)
-
 
 class DummyNodeClass(docutils.nodes.Inline, docutils.nodes.TextElement):
     pass
@@ -130,6 +124,8 @@ def clear(po_path, line, drop_capitalized=False):
     line = regex.sub(r"\s+", " ", line)
     to_drop = {
         r'<a href="[^"]*?">',
+        r"</a>",
+        r"\w*@\w*",  # Emails and various handles (docs@, @sizeof, ...)
         # Strip accronyms
         r"\b[\w-]*\p{Uppercase}{2,}[0-9.\w-]*\b",
         r"---?",  # -- and --- separators to be ignored
@@ -221,9 +217,7 @@ def parse_args():
         help="More output, use -vv, -vvv, and so on.",
     )
     parser.add_argument(
-        "--version",
-        action="version",
-        version="%(prog)s " + __version__ + " using hunspell: " + HUNSPELL_VERSION,
+        "--version", action="version", version="%(prog)s " + __version__,
     )
     parser.add_argument("--debug", action="store_true")
     parser.add_argument("-p", "--personal-dict", type=str)
@@ -241,6 +235,17 @@ def parse_args():
     return args
 
 
+def should_ignore(word):
+    """Tell whether a token should be skipped by the spell checker.
+
+    Return True when every character of *word* is punctuation or a separator
+    (Unicode major classes P and Z), or when any character is numeric (N).
+    """
+    if all(category(c)[0] in "PZ" for c in word):
+        return True
+    return any(category(c)[0] == "N" for c in word)
+
+
 def spell_check(
     po_files, personal_dict, language, drop_capitalized=False, debug_only=False
 ):
@@ -252,32 +257,35 @@ def spell_check(
     """
     errors = 0
-    personal_dict_arg = ["-p", personal_dict] if personal_dict else []
-    with tempfile.TemporaryDirectory() as tmpdirname:
-        tmpdir = Path(tmpdirname)
-        for po_file in po_files:
-            if debug_only:
-                print(po_to_text(str(po_file), drop_capitalized))
-                continue
-            (tmpdir / po_file.name).write_text(
-                po_to_text(str(po_file), drop_capitalized)
-            )
-            try:
-                output = subprocess.check_output(
-                    ["hunspell", "-d", language]
-                    + personal_dict_arg
-                    + ["-u3", str(tmpdir / po_file.name)],
-                    universal_newlines=True,
-                )
-            except subprocess.CalledProcessError:
-                return -1
-            for line in output.split("\n"):
-                match = regex.match(
-                    r"(?P<path>.*):(?P<line>[0-9]+): Locate: (?P<error>.*) \| Try: .*$",
-                    line,
-                )
-                if match:
+    hunspell = Hunspell(language, hunspell_data_dir="/usr/share/hunspell")
+    # The personal dictionary is optional; without one nothing is whitelisted.
+    whitelist = set()
+    if personal_dict:
+        with open(personal_dict, encoding="utf-8") as personal_dict_file:
+            whitelist = {line.strip() for line in personal_dict_file}
+    tknzr = TweetTokenizer()
+    for po_file in po_files:
+        text_to_check = po_to_text(str(po_file), drop_capitalized)
+        if debug_only:
+            print(text_to_check)
+            continue
+        # Report 1-based line numbers.
+        for line_no, line in enumerate(text_to_check.split("\n"), start=1):
+            # Replace typographic apostrophes with ASCII ones before tokenizing.
+            line = line.replace("’", "'")
+            for word in tknzr.tokenize(line):
+                if len(word) == 1:
+                    continue
+                if word in whitelist or word.lower() in whitelist:
+                    continue
+                if not hunspell.spell(word) and not should_ignore(word):
                     errors += 1
-                    print(po_file, match.group("line"), match.group("error"), sep=":")
+                    suggestion = hunspell.suggest(word)
+                    if not suggestion:
+                        print(f"{po_file}:{line_no}: {word!r}")
+                    else:
+                        print(
+                            f"{po_file}:{line_no}: {word!r}, suggestions: {', '.join(suggestion)}"
+                        )
     return errors
 
 
diff --git a/setup.py b/setup.py
index a89d5ab..b07cb74 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@
     extras_require={
         "dev": ["bandit", "black", "detox", "flake8", "isort", "mypy", "pylint"]
     },
-    install_requires=["polib", "docutils>=0.11", "regex"],
+    install_requires=["polib", "docutils>=0.11", "regex", "cyhunspell", "nltk"],
     license="MIT license",
     keywords="po spell gettext reStructuredText check sphinx translation",
     classifiers=[