From 6de8c2f3f1c6bff903e743b5d31541fafef91400 Mon Sep 17 00:00:00 2001 From: Piyush Kanti Chanda Date: Sun, 29 Mar 2026 16:51:53 +0000 Subject: [PATCH] [SPARK-54986][DOCS] Document return types for aggregate functions Add Notes sections to PySpark docstrings for aggregate functions documenting their return data types, which were previously undocumented. - stddev, std, stddev_samp, stddev_pop, variance, var_samp, var_pop, skewness, kurtosis, corr, covar_pop, covar_samp: always DoubleType - avg, mean: DoubleType for numerics, DecimalType for decimals, interval type for intervals - sum: LongType for integrals, DecimalType for decimals, DoubleType for other numerics, interval type for intervals Closes #54986 Co-authored-by: Isaac --- python/pyspark/sql/functions/builtin.py | 97 +++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index f8d18b8e9b5ce..14642f4cc03b9 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -1643,6 +1643,15 @@ def sum(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the column for computed results. + Notes + ----- + - Null values are ignored during the computation. + - The return type depends on the input: + :class:`~pyspark.sql.types.LongType` for integral inputs, + :class:`~pyspark.sql.types.DecimalType` for decimal inputs, + :class:`~pyspark.sql.types.DoubleType` for other numeric inputs, or an interval type + for interval inputs. + Examples -------- Example 1: Calculating the sum of values in a column @@ -1701,6 +1710,14 @@ def avg(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the column for computed results. + Notes + ----- + - Null values are ignored during the computation. 
+ - The return type depends on the input: + :class:`~pyspark.sql.types.DoubleType` for non-decimal numeric inputs, + :class:`~pyspark.sql.types.DecimalType` for decimal inputs, or an interval type + for interval inputs. + Examples -------- Example 1: Calculating the average age @@ -1749,6 +1766,14 @@ def mean(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the column for computed results. + Notes + ----- + - Null values are ignored during the computation. + - The return type depends on the input: + :class:`~pyspark.sql.types.DoubleType` for non-decimal numeric inputs, + :class:`~pyspark.sql.types.DecimalType` for decimal inputs, or an interval type + for interval inputs. + Examples -------- Example 1: Calculating the average age @@ -4250,6 +4275,12 @@ def stddev(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` standard deviation of given column. + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> import pyspark.sql.functions as sf @@ -4289,6 +4320,12 @@ def std(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.skewness` :meth:`pyspark.sql.functions.kurtosis` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> import pyspark.sql.functions as sf @@ -4330,6 +4367,12 @@ def stddev_samp(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.stddev_pop` :meth:`pyspark.sql.functions.var_samp` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. 
+ Examples -------- >>> import pyspark.sql.functions as sf @@ -4371,6 +4414,12 @@ def stddev_pop(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.stddev_samp` :meth:`pyspark.sql.functions.var_pop` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> import pyspark.sql.functions as sf @@ -4412,6 +4461,12 @@ def variance(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.skewness` :meth:`pyspark.sql.functions.kurtosis` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -4453,6 +4508,12 @@ def var_samp(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.var_pop` :meth:`pyspark.sql.functions.std_samp` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -4493,6 +4554,12 @@ def var_pop(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.var_samp` :meth:`pyspark.sql.functions.std_pop` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -5777,6 +5844,12 @@ def skewness(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` skewness of given column. + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. 
+ Examples -------- >>> from pyspark.sql import functions as sf @@ -5818,6 +5891,12 @@ def kurtosis(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.variance` :meth:`pyspark.sql.functions.skewness` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -6751,6 +6830,12 @@ def corr(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` Pearson Correlation Coefficient of these two column values. + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -6793,6 +6878,12 @@ def covar_pop(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: -------- :meth:`pyspark.sql.functions.covar_samp` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -6835,6 +6926,12 @@ def covar_samp(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: -------- :meth:`pyspark.sql.functions.covar_pop` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf