1 change: 1 addition & 0 deletions docker_requirements.txt
@@ -35,3 +35,4 @@ ufal.chu-liu-edmonds==1.0.3
wtpsplit==1.3.0
wunsen==0.0.3
word2word>=1.0.0,<2
budoux==0.7.0
52 changes: 52 additions & 0 deletions pythainlp/tokenize/budoux.py
@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
# SPDX-FileType: SOURCE
# SPDX-License-Identifier: Apache-2.0
"""
Wrapper for BudouX tokenizer (https://github.com/google/budoux)

This module provides a small, defensive wrapper around the Python
`budoux` package. The wrapper lazy-imports the package so that importing
`pythainlp.tokenize` does not fail if `budoux` is not installed. When
the wrapper is used and `budoux` is missing, a clear ImportError is
raised with an installation hint.
"""
from typing import List

_parser = None


def _init_parser():
    """Lazily initialize and return a budoux parser instance.

    Raises ImportError with an installation hint when `budoux`
    is not installed.
    """
    try:
        import budoux
    except Exception as exc:  # pragma: no cover - defensive import
        raise ImportError(
            "budoux is not installed. Install it with: pip install budoux"
        ) from exc

    return budoux.load_default_thai_parser()


def segment(text: str) -> List[str]:
    """Segment `text` into tokens using budoux.

    The function returns a list of strings. If `budoux` is not available,
    the function raises ImportError with an installation hint.
    """
    if not text or not isinstance(text, str):
        return []

    # Cache the parser at module level so it is built only once.
    global _parser
    if _parser is None:
        _parser = _init_parser()

    parser = _parser

    result = parser.parse(text)

    return result
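For anyone reviewing the new module in isolation, a minimal usage sketch (assuming budoux 0.7.0 with its bundled Thai model is installed; the sample sentence and the split shown are illustrative only):

    from pythainlp.tokenize.budoux import segment

    # BudouX only inserts phrase boundaries and never drops characters,
    # so joining the chunks reproduces the input string.
    chunks = segment("วันนี้อากาศดีมาก")
    print(chunks)  # e.g. ['วันนี้', 'อากาศ', 'ดีมาก'] (the actual split depends on the model)
    assert "".join(chunks) == "วันนี้อากาศดีมาก"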
9 changes: 8 additions & 1 deletion pythainlp/tokenize/core.py
@@ -152,6 +152,8 @@ def word_tokenize(
* *tltk* - wrapper for
`TLTK <https://pypi.org/project/tltk/>`_.,
maximum collocation approach
* *budoux* - wrapper for
`budoux <https://github.com/google/budoux>`_.
:Note:
- The **custom_dict** parameter only works for \
*deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
@@ -227,7 +229,8 @@ def word_tokenize(
"nercut",
"sefr_cut",
"tltk",
"oskut"
"oskut",
"budoux",
):
raise NotImplementedError(
f"The {engine} engine does not support custom dictionaries."
@@ -264,6 +267,10 @@
elif engine == "icu":
from pythainlp.tokenize.pyicu import segment

segments = segment(text)
elif engine == "budoux":
from pythainlp.tokenize.budoux import segment

segments = segment(text)
elif engine == "nercut":
from pythainlp.tokenize.nercut import segment
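With the dispatch above in place, the new engine is reachable through the public word_tokenize API. A short sketch of the intended call (output shown is illustrative; the real segmentation depends on the BudouX Thai model):

    from pythainlp.tokenize import word_tokenize

    tokens = word_tokenize("ทดสอบการตัดคำ", engine="budoux")
    print(tokens)  # a list of strings, e.g. ['ทดสอบ', 'การตัดคำ']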
2 changes: 2 additions & 0 deletions setup.py
@@ -86,6 +86,7 @@
"thai_nner": ["thai_nner"],
"thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", NUMPY],
"thai2rom": [NUMPY, "torch>=1.0.0"],
"budoux": ["budoux>=0.7.0"],
"translate": [
'fairseq>=0.10.0,<0.13;python_version<"3.11"',
'fairseq-fixed==0.12.3.1,<0.13;python_version>="3.11"',
@@ -155,6 +156,7 @@
"wtpsplit>=1.0.1",
"wunsen>=0.0.3",
"word2word>=1.0.0",
"budoux>=0.7.0",
],
}
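With these extras in place, the dependency can be pulled in via `pip install pythainlp[budoux]` (or, assuming the second list above is the `full` extra, via `pip install pythainlp[full]`); a plain `pip install budoux` also satisfies the lazy import in the wrapper.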

5 changes: 5 additions & 0 deletions tests/extra/testx_tokenize.py
@@ -333,3 +333,8 @@ def test_sefr_cut(self):
class WordTokenizeTLTKTestCase(unittest.TestCase):
    def test_word_tokenize_tltk(self):
        self.assertIsNotNone(word_tokenize(TEXT_1, engine="tltk"))


class WordTokenizeBudouxTestCase(unittest.TestCase):
    def test_word_tokenize_budoux(self):
        self.assertIsNotNone(word_tokenize(TEXT_1, engine="budoux"))