Skip to content

Commit bbc1c2c

Browse files
authored
Add budoux (#1161)
* Add budoux
1 parent 6022bf7 commit bbc1c2c

File tree

5 files changed

+68
-1
lines changed

5 files changed

+68
-1
lines changed

docker_requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,4 @@ ufal.chu-liu-edmonds==1.0.3
3535
wtpsplit==1.3.0
3636
wunsen==0.0.3
3737
word2word>=1.0.0,<2
38+
budoux==0.7.0

pythainlp/tokenize/budoux.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
3+
# SPDX-FileType: SOURCE
4+
# SPDX-License-Identifier: Apache-2.0
5+
"""
6+
Wrapper for BudouX tokenizer (https://github.com/google/budoux)
7+
8+
This module provides a small, defensive wrapper around the Python
9+
`budoux` package. The wrapper lazy-imports the package so importing
10+
`pythainlp.tokenize` will not fail if `budoux` is not installed. When
11+
used and `budoux` is missing, a clear ImportError is raised with an
12+
installation hint.
13+
"""
14+
from typing import List
15+
16+
_parser = None
17+
18+
19+
def _init_parser():
20+
"""Lazy initialize and return a budoux parser instance.
21+
22+
Raises ImportError when `budoux` is not installed, and RuntimeError
23+
if the installed budoux does not expose a supported API.
24+
"""
25+
try:
26+
import budoux
27+
except Exception as exc: # pragma: no cover - defensive import
28+
raise ImportError(
29+
"budoux is not installed. Install it with: pip install budoux"
30+
) from exc
31+
32+
return budoux.load_default_thai_parser()
33+
34+
35+
def segment(text: str) -> List[str]:
    """Segment `text` into tokens using budoux.

    Non-string or empty input yields an empty list. When `budoux` is not
    installed, the first real use raises ImportError with an install hint.

    :param str text: text to be tokenized
    :return: list of tokens
    :rtype: List[str]
    """
    # Guard clause: anything falsy or non-str is treated as "nothing to do".
    if not isinstance(text, str) or not text:
        return []

    # Build the parser once per process and reuse it afterwards.
    global _parser
    if _parser is None:
        _parser = _init_parser()

    return _parser.parse(text)

pythainlp/tokenize/core.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@ def word_tokenize(
152152
* *tltk* - wrapper for
153153
`TLTK <https://pypi.org/project/tltk/>`_.,
154154
maximum collocation approach
155+
* *budoux* - wrapper for
156+
`budoux <https://github.com/google/budoux>`_.
155157
:Note:
156158
- The **custom_dict** parameter only works for \
157159
*deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
@@ -227,7 +229,8 @@ def word_tokenize(
227229
"nercut",
228230
"sefr_cut",
229231
"tltk",
230-
"oskut"
232+
"oskut",
233+
"budoux",
231234
):
232235
raise NotImplementedError(
233236
f"The {engine} engine does not support custom dictionaries."
@@ -264,6 +267,10 @@ def word_tokenize(
264267
elif engine == "icu":
265268
from pythainlp.tokenize.pyicu import segment
266269

270+
segments = segment(text)
271+
elif engine == "budoux":
272+
from pythainlp.tokenize.budoux import segment
273+
267274
segments = segment(text)
268275
elif engine == "nercut":
269276
from pythainlp.tokenize.nercut import segment

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@
8686
"thai_nner": ["thai_nner"],
8787
"thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", NUMPY],
8888
"thai2rom": [NUMPY, "torch>=1.0.0"],
89+
"budoux": ["budoux>=0.7.0"],
8990
"translate": [
9091
'fairseq>=0.10.0,<0.13;python_version<"3.11"',
9192
'fairseq-fixed==0.12.3.1,<0.13;python_version>="3.11"',
@@ -155,6 +156,7 @@
155156
"wtpsplit>=1.0.1",
156157
"wunsen>=0.0.3",
157158
"word2word>=1.0.0",
159+
"budoux>=0.7.0",
158160
],
159161
}
160162

tests/extra/testx_tokenize.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,8 @@ def test_sefr_cut(self):
333333
class WordTokenizeTLTKTestCase(unittest.TestCase):
334334
def test_word_tokenize_tltk(self):
335335
self.assertIsNotNone(word_tokenize(TEXT_1, engine="tltk"))
336+
337+
338+
class WordTokenizeBudouxTestCase(unittest.TestCase):
    """Smoke test for the ``budoux`` word-tokenize engine."""

    def test_word_tokenize_budoux(self):
        # Only asserts that tokenization yields something (not None);
        # token quality is not checked here.
        self.assertIsNotNone(word_tokenize(TEXT_1, engine="budoux"))

0 commit comments

Comments
 (0)