From e44e0a99c2c5c905bbbf1a431b92fe4c8eb9c204 Mon Sep 17 00:00:00 2001
From: Tian Gao
Date: Mon, 30 Mar 2026 15:41:25 -0700
Subject: [PATCH 1/3] Add Java error classes to Python side

Load the error conditions stored in the spark-common-utils jar under
SPARK_HOME and merge them with the ones bundled with PySpark, so that
Python also recognizes error classes defined on the JVM side. When both
sides define the same name, the Python definition wins.

The jar lookup runs while pyspark.errors is being imported, so it is best
effort: a jar that cannot be read, or that has no
error/error-conditions.json entry, is skipped instead of failing the
import. Matching jars are sorted so the same one is picked on every run.
---
Note: get_error_classes() was revised during review (docstring, sorted jar
selection, best-effort jar reading). The commit id above and the blob ids on
the "index" lines of error_classes.py predate that revision; regenerate the
series with git format-patch before relying on them (e.g. for `git am -3`).

 python/pyspark/errors/error_classes.py     | 50 ++++++++++++++++++++--
 python/pyspark/errors/tests/test_errors.py | 21 +++++++--
 2 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py
index 12094d61336e0..c641d0e95fdce 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -15,14 +15,56 @@
 # limitations under the License.
 #
 
+import glob
 import json
 import importlib.resources
+import os
+import zipfile
+
+from pyspark.find_spark_home import _find_spark_home
 
 # Note: Though we call them "error classes" here, the proper name is "error conditions",
 # hence why the name of the JSON file is different.
 # For more information, please see: https://issues.apache.org/jira/browse/SPARK-46810
 # This discrepancy will be resolved as part of: https://issues.apache.org/jira/browse/SPARK-47429
-ERROR_CLASSES_JSON = (
-    importlib.resources.files("pyspark.errors").joinpath("error-conditions.json").read_text()
-)
-ERROR_CLASSES_MAP = json.loads(ERROR_CLASSES_JSON)
+
+
+def get_error_classes():
+    """
+    Build the error class map used by PySpark.
+
+    The map combines the error conditions bundled with PySpark and, on a best-effort basis,
+    the ones stored in the ``spark-common-utils`` jar found under the Spark home directory.
+    Python definitions take precedence when both sides define the same name.
+    """
+    python_error_classes_json = (
+        importlib.resources.files("pyspark.errors").joinpath("error-conditions.json").read_text()
+    )
+    python_error_classes_map = json.loads(python_error_classes_json)
+
+    # We load the Java error classes from the jars so Python recognizes them too
+    java_error_classes_map = {}
+    spark_home = _find_spark_home()
+
+    # Released spark packages have the jars in SPARK_HOME/jars, and development builds have them
+    # in assembly/target
+    for bin_dir in ("jars", "assembly/target/scala-*/jars"):
+        bin_path = os.path.join(spark_home, bin_dir)
+        # glob returns matches in arbitrary order; sort so the pick is deterministic
+        jars = sorted(glob.glob(os.path.join(bin_path, "spark-common-utils_*.jar")))
+        if not jars:
+            continue
+        try:
+            with zipfile.ZipFile(jars[0]) as zf:
+                with zf.open("error/error-conditions.json") as f:
+                    java_error_classes_map = json.loads(f.read().decode("utf-8"))
+        except (OSError, KeyError, ValueError, zipfile.BadZipFile):
+            # This runs while pyspark.errors is being imported, so a jar that is unreadable or
+            # does not contain the entry must not break the import; try the next location
+            continue
+        break
+
+    return java_error_classes_map | python_error_classes_map
+
+
+ERROR_CLASSES_MAP = get_error_classes()
diff --git a/python/pyspark/errors/tests/test_errors.py b/python/pyspark/errors/tests/test_errors.py
index 7c25c6965cbd1..8d64c0decd662 100644
--- a/python/pyspark/errors/tests/test_errors.py
+++ b/python/pyspark/errors/tests/test_errors.py
@@ -16,19 +16,23 @@
 # limitations under the License.
 #
 
+import importlib.resources
 import json
 import unittest
 
 from pyspark.errors import PySparkRuntimeError, PySparkValueError
-from pyspark.errors.error_classes import ERROR_CLASSES_JSON
 from pyspark.errors.utils import ErrorClassesReader
 
 
 class ErrorsTest(unittest.TestCase):
     def test_error_classes_sorted(self):
         # Test error classes is sorted alphabetically
-        error_reader = ErrorClassesReader()
-        error_class_names = list(error_reader.error_info_map.keys())
+        ERROR_CLASSES_JSON = (
+            importlib.resources.files("pyspark.errors")
+            .joinpath("error-conditions.json")
+            .read_text()
+        )
+        error_class_names = list(json.loads(ERROR_CLASSES_JSON).keys())
         for i in range(len(error_class_names) - 1):
             self.assertTrue(
                 error_class_names[i] < error_class_names[i + 1],
@@ -48,6 +52,12 @@ def detect_duplication(pairs):
                 error_classes_json[name] = message
             return error_classes_json
 
+        ERROR_CLASSES_JSON = (
+            importlib.resources.files("pyspark.errors")
+            .joinpath("error-conditions.json")
+            .read_text()
+        )
+
         json.loads(ERROR_CLASSES_JSON, object_pairs_hook=detect_duplication)
 
     def test_invalid_error_class(self):
@@ -108,6 +118,11 @@ def test_breaking_change_info(self):
         subclass_map = error_reader.error_info_map["TEST_ERROR_WITH_SUB_CLASS"]["sub_class"]
         self.assertEqual(breaking_change_info2, subclass_map["SUBCLASS"]["breaking_change_info"])
 
+    def test_java_error_classes(self):
+        error_reader = ErrorClassesReader()
+        msg = error_reader.get_error_message("AGGREGATE_OUT_OF_MEMORY", {})
+        self.assertEqual(msg, "No enough memory for aggregation")
+
     def test_sqlstate(self):
         error = PySparkRuntimeError(errorClass="APPLICATION_NAME_NOT_SET", messageParameters={})
         self.assertIsNone(error.getSqlState())

From 1138a5332d926fb89f4824adf0993467e978bc3b Mon Sep 17 00:00:00 2001
From: Tian Gao
Date: Mon, 30 Mar 2026 17:29:07 -0700
Subject: [PATCH 2/3] Add type hint

---
Note: hunk position and context follow the revised PATCH 1/3; the blob ids on
the "index" line predate that revision.

 python/pyspark/errors/error_classes.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/errors/error_classes.py b/python/pyspark/errors/error_classes.py
index c641d0e95fdce..30844ffecd317 100644
--- a/python/pyspark/errors/error_classes.py
+++ b/python/pyspark/errors/error_classes.py
@@ -29,7 +29,7 @@
 # This discrepancy will be resolved as part of: https://issues.apache.org/jira/browse/SPARK-47429
 
 
-def get_error_classes():
+def get_error_classes() -> dict[str, dict]:
     """
     Build the error class map used by PySpark.
 

From 3f1e67c5971c46956f73108b0d7d215cee76d8f1 Mon Sep 17 00:00:00 2001
From: Tian Gao
Date: Mon, 30 Mar 2026 17:29:44 -0700
Subject: [PATCH 3/3] Fix error doc gen

---
 python/pyspark/errors_doc_gen.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/errors_doc_gen.py b/python/pyspark/errors_doc_gen.py
index 53b8b8d1e12f1..93187e3955709 100644
--- a/python/pyspark/errors_doc_gen.py
+++ b/python/pyspark/errors_doc_gen.py
@@ -1,7 +1,7 @@
+import importlib.resources
+import json
 import re
 
-from pyspark.errors.error_classes import ERROR_CLASSES_MAP
-
 
 def generate_errors_doc(output_rst_file_path: str) -> None:
     """
@@ -47,7 +47,13 @@ def generate_errors_doc(output_rst_file_path: str) -> None:
 """
     with open(output_rst_file_path, "w") as f:
         f.write(header + "\n\n")
-        for error_key, error_details in ERROR_CLASSES_MAP.items():
+        python_error_classes_json = (
+            importlib.resources.files("pyspark.errors")
+            .joinpath("error-conditions.json")
+            .read_text()
+        )
+        python_error_classes_map = json.loads(python_error_classes_json)
+        for error_key, error_details in python_error_classes_map.items():
             f.write(error_key + "\n")
             # The length of the error class name and underline must be the same
             # to satisfy the RST format.