From a68ad35d96d6d1740c8e1d4d62f19d9dfe1e2eaf Mon Sep 17 00:00:00 2001 From: Ruifeng Zheng Date: Wed, 1 Apr 2026 01:50:31 +0000 Subject: [PATCH] [PYSPARK] Fix test_pyarrow_array_type_inference for pandas >= 3 pandas 3.x changed the default string dtype to use pyarrow-backed storage, causing pa.array() to infer large_string instead of string for string Series. Conditionally expect large_string on pandas >= 3. Co-authored-by: Isaac --- .../pyarrow/test_pyarrow_array_type_inference.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_type_inference.py b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_type_inference.py index 1c859fc9de2a1..7e60f552e9c7a 100644 --- a/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_type_inference.py +++ b/python/pyspark/tests/upstream/pyarrow/test_pyarrow_array_type_inference.py @@ -295,6 +295,10 @@ def test_pandas_series_numpy_backed(self): import numpy as np import pandas as pd import pyarrow as pa + from pyspark.loose_version import LooseVersion + + # pandas >= 3 defaults string Series to pyarrow-backed str dtype, which infers large_string + string_type = pa.large_string() if LooseVersion(pd.__version__) >= "3.0.0" else pa.string() sg = ZoneInfo("Asia/Singapore") la = "America/Los_Angeles" @@ -315,7 +319,7 @@ def test_pandas_series_numpy_backed(self): (pd.Series([np.inf, 1.0, 2.0]), pa.float64()), (pd.Series([-np.inf, 1.0, 2.0]), pa.float64()), # String - (pd.Series(["a", "b", "c"]), pa.string()), + (pd.Series(["a", "b", "c"]), string_type), # Boolean (pd.Series([True, False, True]), pa.bool_()), # Temporal @@ -356,6 +360,10 @@ def test_pandas_series_nullable_extension(self): import numpy as np import pandas as pd import pyarrow as pa + from pyspark.loose_version import LooseVersion + + # pandas >= 3 uses pyarrow-backed StringDtype, which infers large_string + string_type = pa.large_string() if LooseVersion(pd.__version__) >= "3.0.0" else 
pa.string() cases = [ # Integer @@ -379,8 +387,8 @@ def test_pandas_series_nullable_extension(self): (pd.Series([True, False, True], dtype=pd.BooleanDtype()), pa.bool_()), (pd.Series([True, False, None], dtype=pd.BooleanDtype()), pa.bool_()), # String - (pd.Series(["a", "b", "c"], dtype=pd.StringDtype()), pa.string()), - (pd.Series(["a", "b", None], dtype=pd.StringDtype()), pa.string()), + (pd.Series(["a", "b", "c"], dtype=pd.StringDtype()), string_type), + (pd.Series(["a", "b", None], dtype=pd.StringDtype()), string_type), ] self._run_inference_tests(cases)