Skip to content

Commit bbc1c2c

Browse files
authored
Add budoux (#1161)
* Add budoux
1 parent 6022bf7 commit bbc1c2c

File tree

5 files changed

+68
-1
lines changed

5 files changed

+68
-1
lines changed

docker_requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,3 +35,4 @@ ufal.chu-liu-edmonds==1.0.3
3535
wtpsplit==1.3.0
3636
wunsen==0.0.3
3737
word2word>=1.0.0,<2
38+
budoux==0.7.0

pythainlp/tokenize/budoux.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# -*- coding: utf-8 -*-
2+
# SPDX-FileCopyrightText: 2016-2025 PyThaiNLP Project
3+
# SPDX-FileType: SOURCE
4+
# SPDX-License-Identifier: Apache-2.0
5+
"""
6+
Wrapper for BudouX tokenizer (https://github.com/google/budoux)
7+
8+
This module provides a small, defensive wrapper around the Python
9+
`budoux` package. The wrapper lazy-imports the package so importing
10+
`pythainlp.tokenize` will not fail if `budoux` is not installed. When
11+
used and `budoux` is missing, a clear ImportError is raised with an
12+
installation hint.
13+
"""
14+
from typing import List
15+
16+
_parser = None
17+
18+
19+
def _init_parser():
20+
"""Lazy initialize and return a budoux parser instance.
21+
22+
Raises ImportError when `budoux` is not installed, and RuntimeError
23+
if the installed budoux does not expose a supported API.
24+
"""
25+
try:
26+
import budoux
27+
except Exception as exc: # pragma: no cover - defensive import
28+
raise ImportError(
29+
"budoux is not installed. Install it with: pip install budoux"
30+
) from exc
31+
32+
return budoux.load_default_thai_parser()
33+
34+
35+
def segment(text: str) -> List[str]:
    """Segment `text` into tokens using budoux.

    Non-string or empty input yields an empty list. When `budoux` is not
    installed, the first real use raises ImportError with an install hint.

    :param str text: text to be tokenized
    :return: list of tokens
    :rtype: List[str]
    """
    # Guard clause: anything falsy or non-str is treated as "nothing to do".
    if not isinstance(text, str) or not text:
        return []

    # Build the parser once per process and reuse it afterwards.
    global _parser
    if _parser is None:
        _parser = _init_parser()

    return _parser.parse(text)

pythainlp/tokenize/core.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,8 @@ def word_tokenize(
152152
* *tltk* - wrapper for
153153
`TLTK <https://pypi.org/project/tltk/>`_.,
154154
maximum collocation approach
155+
* *budoux* - wrapper for
156+
`budoux <https://github.com/google/budoux>`_.
155157
:Note:
156158
- The **custom_dict** parameter only works for \
157159
*deepcut*, *longest*, *newmm*, and *newmm-safe* engines.
@@ -227,7 +229,8 @@ def word_tokenize(
227229
"nercut",
228230
"sefr_cut",
229231
"tltk",
230-
"oskut"
232+
"oskut",
233+
"budoux",
231234
):
232235
raise NotImplementedError(
233236
f"The {engine} engine does not support custom dictionaries."
@@ -264,6 +267,10 @@ def word_tokenize(
264267
elif engine == "icu":
265268
from pythainlp.tokenize.pyicu import segment
266269

270+
segments = segment(text)
271+
elif engine == "budoux":
272+
from pythainlp.tokenize.budoux import segment
273+
267274
segments = segment(text)
268275
elif engine == "nercut":
269276
from pythainlp.tokenize.nercut import segment

setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@
8686
"thai_nner": ["thai_nner"],
8787
"thai2fit": ["emoji>=0.5.1", "gensim>=4.0.0", NUMPY],
8888
"thai2rom": [NUMPY, "torch>=1.0.0"],
89+
"budoux": ["budoux>=0.7.0"],
8990
"translate": [
9091
'fairseq>=0.10.0,<0.13;python_version<"3.11"',
9192
'fairseq-fixed==0.12.3.1,<0.13;python_version>="3.11"',
@@ -155,6 +156,7 @@
155156
"wtpsplit>=1.0.1",
156157
"wunsen>=0.0.3",
157158
"word2word>=1.0.0",
159+
"budoux>=0.7.0",
158160
],
159161
}
160162

tests/extra/testx_tokenize.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -333,3 +333,8 @@ def test_sefr_cut(self):
333333
class WordTokenizeTLTKTestCase(unittest.TestCase):
334334
def test_word_tokenize_tltk(self):
335335
self.assertIsNotNone(word_tokenize(TEXT_1, engine="tltk"))
336+
337+
338+
class WordTokenizeBudouxTestCase(unittest.TestCase):
    """Smoke test for the ``budoux`` word-tokenize engine."""

    def test_word_tokenize_budoux(self):
        # Only asserts that tokenization yields something (not None);
        # token quality is not checked here.
        self.assertIsNotNone(word_tokenize(TEXT_1, engine="budoux"))

0 commit comments

Comments
 (0)