diff --git a/docker_requirements.txt b/docker_requirements.txt
index f9ac7e93d..367c3916a 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -35,3 +35,4 @@ ufal.chu-liu-edmonds==1.0.3
wtpsplit==1.3.0
wunsen==0.0.3
word2word>=1.0.0,<2
+budoux==0.7.0
diff --git a/pythainlp/tokenize/budoux.py b/pythainlp/tokenize/budoux.py
new file mode 100644
index 000000000..7bddb308d
--- /dev/null
+++ b/pythainlp/tokenize/budoux.py
@@ -0,0 +1,55 @@
+# -*- coding: utf-8 -*-
+# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
+# SPDX-FileType: SOURCE
+# SPDX-License-Identifier: Apache-2.0
+"""
+Wrapper for BudouX tokenizer (https://github.com/google/budoux)
+
+This module provides a small, defensive wrapper around the Python
+`budoux` package. The wrapper lazy-imports the package so importing
+`pythainlp.tokenize` will not fail if `budoux` is not installed. If the
+wrapper is used while `budoux` is missing, a clear ImportError is raised
+with an installation hint.
+"""
+from typing import List
+
+_parser = None
+
+
+def _init_parser():
+    """Lazily initialize and return a budoux parser instance.
+
+    Raises ImportError when `budoux` is not installed, and RuntimeError
+    if the installed budoux does not expose a supported API.
+    """
+    try:
+        import budoux
+    except Exception as exc:  # pragma: no cover - defensive import
+        raise ImportError(
+            "budoux is not installed. Install it with: pip install budoux"
+        ) from exc
+
+    loader = getattr(budoux, "load_default_thai_parser", None)
+    if loader is None:  # pragma: no cover - defensive API check
+        raise RuntimeError(
+            "budoux is installed but does not provide "
+            "load_default_thai_parser(); try: pip install -U budoux"
+        )
+
+    return loader()
+
+
+def segment(text: str) -> List[str]:
+    """Segment `text` into tokens using budoux.
+
+    The function returns a list of strings. If `budoux` is not installed,
+    it raises ImportError with an installation hint.
+    """
+    if not text or not isinstance(text, str):
+        return []
+
+    global _parser
+    if _parser is None:
+        _parser = _init_parser()
+
+    return _parser.parse(text)
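For reference, the wrapper above reduces to the following direct use of the
budoux API (a minimal sketch, assuming the budoux>=0.7.0 pinned in this diff
is installed; the Thai sample string is illustrative):

    import budoux

    # load_default_thai_parser() is the loader the wrapper calls; parse()
    # splits the input into a list of string segments.
    parser = budoux.load_default_thai_parser()
    print(parser.parse("สวัสดีครับ"))

Caching the parser in the module-level _parser avoids reloading the model on
every segment() call.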
diff --git a/pythainlp/tokenize/core.py b/pythainlp/tokenize/core.py
index bcc166c0d..3ae208904 100644
--- a/pythainlp/tokenize/core.py
+++ b/pythainlp/tokenize/core.py
@@ -152,6 +152,8 @@ def word_tokenize(
     * *tltk* - wrapper for
       `TLTK <https://pypi.org/project/tltk/>`_.,
       maximum collocation approach
+    * *budoux* - wrapper for
+      `budoux <https://github.com/google/budoux>`_.
 :Note:
     - The **custom_dict** parameter only works for \
       *deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
@@ -227,7 +229,8 @@ def word_tokenize(
"nercut",
"sefr_cut",
"tltk",
- "oskut"
+ "oskut",
+ "budoux",
):
raise NotImplementedError(
f"The {engine} engine does not support custom dictionaries."
@@ -264,6 +267,10 @@ def word_tokenize(
elif engine == "icu":
from pythainlp.tokenize.pyicu import segment
+ segments = segment(text)
+ elif engine == "budoux":
+ from pythainlp.tokenize.budoux import segment
+
segments = segment(text)
elif engine == "nercut":
from pythainlp.tokenize.nercut import segment
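With the dispatch above in place, the new engine is selected like any other
word_tokenize backend (a usage sketch; the sample text is illustrative and
the exact segmentation depends on the budoux Thai model):

    from pythainlp.tokenize import word_tokenize

    # Requires the optional dependency: pip install budoux
    tokens = word_tokenize("ทดสอบการตัดคำภาษาไทย", engine="budoux")
    print(tokens)  # a list of string segments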
diff --git a/setup.py b/setup.py
index ce8092e23..324b4caa8 100644
--- a/setup.py
+++ b/setup.py
@@ -86,6 +86,7 @@
"thai_nner": ["thai_nner"],
"thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", NUMPY],
"thai2rom": [NUMPY, "torch>=1.0.0"],
+ "budoux": ["budoux>=0.7.0"],
"translate": [
'fairseq>=0.10.0,<0.13;python_version<"3.11"',
'fairseq-fixed==0.12.3.1,<0.13;python_version>="3.11"',
@@ -155,6 +156,7 @@
"wtpsplit>=1.0.1",
"wunsen>=0.0.3",
"word2word>=1.0.0",
+ "budoux>=0.7.0",
],
}
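Since budoux stays optional outside the "full" extra, users can opt in with
pip install "pythainlp[budoux]" or by installing budoux directly. Thanks to
the lazy import, pythainlp.tokenize stays importable without it, and the
installation hint surfaces only when the engine is actually used (a sketch of
the expected failure mode, assuming budoux is absent):

    from pythainlp.tokenize import word_tokenize  # succeeds without budoux

    try:
        word_tokenize("ทดสอบ", engine="budoux")
    except ImportError as exc:
        # "budoux is not installed. Install it with: pip install budoux"
        print(exc)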
diff --git a/tests/extra/testx_tokenize.py b/tests/extra/testx_tokenize.py
index d089c2748..d7b6a8287 100644
--- a/tests/extra/testx_tokenize.py
+++ b/tests/extra/testx_tokenize.py
@@ -333,3 +333,8 @@ def test_sefr_cut(self):
 class WordTokenizeTLTKTestCase(unittest.TestCase):
     def test_word_tokenize_tltk(self):
         self.assertIsNotNone(word_tokenize(TEXT_1, engine="tltk"))
+
+
+class WordTokenizeBudouxTestCase(unittest.TestCase):
+    def test_word_tokenize_budoux(self):
+        self.assertIsNotNone(word_tokenize(TEXT_1, engine="budoux"))
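A possible follow-up to this smoke test: budoux inserts break points without
rewriting characters, so if word_tokenize's whitespace handling leaves the
text untouched, a round-trip assertion would catch dropped characters (a
hedged sketch for the same test module; the test class name and the
lossless-output assumption are mine, TEXT_1 is the module's existing fixture):

    class WordTokenizeBudouxRoundTripTestCase(unittest.TestCase):
        def test_word_tokenize_budoux_round_trip(self):
            # Assumes budoux segments concatenate back to the input text.
            tokens = word_tokenize(TEXT_1, engine="budoux")
            self.assertEqual("".join(tokens), TEXT_1)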