diff --git a/docker_requirements.txt b/docker_requirements.txt
index f9ac7e93d..367c3916a 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -35,3 +35,4 @@ ufal.chu-liu-edmonds==1.0.3
 wtpsplit==1.3.0
 wunsen==0.0.3
 word2word>=1.0.0,<2
+budoux==0.7.0
diff --git a/pythainlp/tokenize/budoux.py b/pythainlp/tokenize/budoux.py
new file mode 100644
index 000000000..7bddb308d
--- /dev/null
+++ b/pythainlp/tokenize/budoux.py
@@ -0,0 +1,52 @@
+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Wrapper for the BudouX tokenizer (https://github.com/google/budoux)
+
+This module provides a small, defensive wrapper around the Python
+`budoux` package. The wrapper lazy-imports the package, so importing
+`pythainlp.tokenize` will not fail if `budoux` is not installed; a
+clear ImportError with an installation hint is raised only when the
+wrapper is actually used.
+"""
+from typing import List
+
+_parser = None
+
+
+def _init_parser():
+    """Lazily initialize and return a budoux Thai parser instance.
+
+    Raises ImportError when `budoux` is not installed. The returned
+    object is budoux's default Thai parser.
+    """
+    try:
+        import budoux
+    except Exception as exc:  # pragma: no cover - defensive import
+        raise ImportError(
+            "budoux is not installed. Install it with: pip install budoux"
+        ) from exc
+
+    return budoux.load_default_thai_parser()
+
+
+def segment(text: str) -> List[str]:
+    """Segment `text` into tokens using budoux.
+
+    The function returns a list of strings. If `budoux` is not available,
+    the function raises ImportError with an installation hint.
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    global _parser
+    if _parser is None:
+        _parser = _init_parser()
+
+    parser = _parser
+
+    result = parser.parse(text)
+
+    return result
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index bcc166c0d..3ae208904 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -152,6 +152,8 @@ def word_tokenize(
         * *tltk* - wrapper for
           `TLTK <https://pypi.org/project/tltk/>`_.,
           maximum collocation approach
+        * *budoux* - wrapper for
+          `budoux <https://github.com/google/budoux>`_.
     :Note:
         - The **custom_dict** parameter only works for \
           *deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
@@ -227,7 +229,8 @@ def word_tokenize(
         "nercut",
         "sefr_cut",
         "tltk",
-        "oskut"
+        "oskut",
+        "budoux",
     ):
         raise NotImplementedError(
             f"The {engine} engine does not support custom dictionaries."
@@ -264,6 +267,10 @@ def word_tokenize(
     elif engine == "icu":
         from pythainlp.tokenize.pyicu import segment
 
+        segments = segment(text)
+    elif engine == "budoux":
+        from pythainlp.tokenize.budoux import segment
+
         segments = segment(text)
     elif engine == "nercut":
         from pythainlp.tokenize.nercut import segment
diff --git a/setup.py b/setup.py
index ce8092e23..324b4caa8 100644
--- a/setup.py
+++ b/setup.py
@@ -86,6 +86,7 @@
     "thai_nner": ["thai_nner"],
     "thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", NUMPY],
     "thai2rom": [NUMPY, "torch>=1.0.0"],
+    "budoux": ["budoux>=0.7.0"],
     "translate": [
         'fairseq>=0.10.0,<0.13;python_version<"3.11"',
         'fairseq-fixed==0.12.3.1,<0.13;python_version>="3.11"',
@@ -155,6 +156,7 @@
         "wtpsplit>=1.0.1",
         "wunsen>=0.0.3",
         "word2word>=1.0.0",
+        "budoux>=0.7.0",
     ],
 }
 
diff --git a/tests/extra/testx_tokenize.py b/tests/extra/testx_tokenize.py
index d089c2748..d7b6a8287 100644
--- a/tests/extra/testx_tokenize.py
+++ b/tests/extra/testx_tokenize.py
@@ -333,3 +333,8 @@ def test_sefr_cut(self):
 class WordTokenizeTLTKTestCase(unittest.TestCase):
     def test_word_tokenize_tltk(self):
         self.assertIsNotNone(word_tokenize(TEXT_1, engine="tltk"))
+
+
+class WordTokenizeBudouxTestCase(unittest.TestCase):
+    def test_word_tokenize_budoux(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="budoux"))
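
Usage sketch (not part of the patch): with this change applied and the optional dependency installed (pip install budoux), the new engine is selected through the regular word_tokenize entry point, as the test above does, or by calling the wrapper module directly. The sample sentence below is illustrative only; BudouX is designed for line-break segmentation, so its output granularity may differ from PyThaiNLP's word-level engines.

    # Minimal usage sketch, assuming pythainlp with this patch and budoux>=0.7.0.
    from pythainlp.tokenize import word_tokenize
    from pythainlp.tokenize.budoux import segment

    # Illustrative Thai sample ("The weather is very nice today");
    # the exact segmentation depends on budoux's bundled Thai model.
    text = "วันนี้อากาศดีมาก"

    # Through the common tokenizer front-end added by this patch:
    print(word_tokenize(text, engine="budoux"))

    # Or through the wrapper module directly; both return a list of strings.
    print(segment(text))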