File tree Expand file tree Collapse file tree 5 files changed +68
-1
lines changed Expand file tree Collapse file tree 5 files changed +68
-1
lines changed Original file line number Diff line number Diff line change @@ -35,3 +35,4 @@ ufal.chu-liu-edmonds==1.0.3
3535wtpsplit==1.3.0
3636wunsen==0.0.3
3737word2word>=1.0.0,<2
38+ budoux==0.7.0
Original file line number Diff line number Diff line change 1+ # -*- coding: utf-8 -*-
2+ # SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
3+ # SPDX-FileType: SOURCE
4+ # SPDX-License-Identifier: Apache-2.0
5+ """
6+ Wrapper for BudouX tokenizer (https://github.com/google/budoux)
7+
8+ This module provides a small, defensive wrapper around the Python
9+ `budoux` package. The wrapper lazy-imports the package so importing
10+ `pythainlp.tokenize` will not fail if `budoux` is not installed. When
11+ used and `budoux` is missing, a clear ImportError is raised with an
12+ installation hint.
13+ """
14+ from typing import List
15+
16+ _parser = None
17+
18+
19+ def _init_parser ():
20+ """Lazy initialize and return a budoux parser instance.
21+
22+ Raises ImportError when `budoux` is not installed, and RuntimeError
23+ if the installed budoux does not expose a supported API.
24+ """
25+ try :
26+ import budoux
27+ except Exception as exc : # pragma: no cover - defensive import
28+ raise ImportError (
29+ "budoux is not installed. Install it with: pip install budoux"
30+ ) from exc
31+
32+ return budoux .load_default_thai_parser ()
33+
34+
def segment(text: str) -> List[str]:
    """Tokenize ``text`` with the budoux Thai parser.

    The parser is created on first use and cached at module level, so
    repeated calls do not re-load the model.

    :param text: input string to segment
    :return: list of token strings; ``[]`` for empty or non-string input
    :raises ImportError: when the ``budoux`` package is not installed
    """
    # Guard clause: bail out early on anything that is not a non-empty str.
    if not isinstance(text, str) or not text:
        return []

    global _parser
    if _parser is None:
        _parser = _init_parser()

    return _parser.parse(text)
Original file line number Diff line number Diff line change @@ -152,6 +152,8 @@ def word_tokenize(
152152 * *tltk* - wrapper for
153153 `TLTK <https://pypi.org/project/tltk/>`_.,
154154 maximum collocation approach
155+ * *budoux* - wrapper for
156+ `budoux <https://github.com/google/budoux>`_.
155157 :Note:
156158 - The **custom_dict** parameter only works for \
157159 *deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
@@ -227,7 +229,8 @@ def word_tokenize(
227229 "nercut" ,
228230 "sefr_cut" ,
229231 "tltk" ,
230- "oskut"
232+ "oskut" ,
233+ "budoux" ,
231234 ):
232235 raise NotImplementedError (
233236 f"The { engine } engine does not support custom dictionaries."
@@ -264,6 +267,10 @@ def word_tokenize(
264267 elif engine == "icu" :
265268 from pythainlp .tokenize .pyicu import segment
266269
270+ segments = segment (text )
271+ elif engine == "budoux" :
272+ from pythainlp .tokenize .budoux import segment
273+
267274 segments = segment (text )
268275 elif engine == "nercut" :
269276 from pythainlp .tokenize .nercut import segment
Original file line number Diff line number Diff line change 8686 "thai_nner" : ["thai_nner" ],
8787 "thai2fit" : ["emoji>=0.5.1" , "gensim>=4.0.0" , NUMPY ],
8888 "thai2rom" : [NUMPY , "torch>=1.0.0" ],
89+ "budoux" : ["budoux>=0.7.0" ],
8990 "translate" : [
9091 'fairseq>=0.10.0,<0.13;python_version<"3.11"' ,
9192 'fairseq-fixed==0.12.3.1,<0.13;python_version>="3.11"' ,
155156 "wtpsplit>=1.0.1" ,
156157 "wunsen>=0.0.3" ,
157158 "word2word>=1.0.0" ,
159+ "budoux>=0.7.0" ,
158160 ],
159161}
160162
Original file line number Diff line number Diff line change @@ -333,3 +333,8 @@ def test_sefr_cut(self):
class WordTokenizeTLTKTestCase(unittest.TestCase):
    """Smoke test for the TLTK word-tokenizer engine."""

    def test_word_tokenize_tltk(self):
        # The engine should produce a (possibly empty) token list, never None.
        tokens = word_tokenize(TEXT_1, engine="tltk")
        self.assertIsNotNone(tokens)
336+
337+
class WordTokenizeBudouxTestCase(unittest.TestCase):
    """Smoke test for the BudouX word-tokenizer engine."""

    def test_word_tokenize_budoux(self):
        # The engine should produce a (possibly empty) token list, never None.
        tokens = word_tokenize(TEXT_1, engine="budoux")
        self.assertIsNotNone(tokens)
You can’t perform that action at this time.
0 commit comments