From 6de8c2f3f1c6bff903e743b5d31541fafef91400 Mon Sep 17 00:00:00 2001 From: Piyush Kanti Chanda Date: Sun, 29 Mar 2026 16:51:53 +0000 Subject: [PATCH] [SPARK-54986][DOCS] Document return types for aggregate functions Add Notes sections to PySpark docstrings for aggregate functions documenting their return data types, which were previously undocumented. - stddev, std, stddev_samp, stddev_pop, variance, var_samp, var_pop, skewness, kurtosis, corr, covar_pop, covar_samp: always DoubleType - avg, mean: DoubleType for numerics, DecimalType for decimals, interval type for intervals - sum: LongType for integrals, DecimalType for decimals, DoubleType for other numerics, interval type for intervals Closes #54986 Co-authored-by: Isaac --- python/pyspark/sql/functions/builtin.py | 97 +++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py index f8d18b8e9b5ce..14642f4cc03b9 100644 --- a/python/pyspark/sql/functions/builtin.py +++ b/python/pyspark/sql/functions/builtin.py @@ -1643,6 +1643,15 @@ def sum(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the column for computed results. + Notes + ----- + - Null values are ignored during the computation. + - The return type depends on the input: + :class:`~pyspark.sql.types.LongType` for integral inputs, + :class:`~pyspark.sql.types.DecimalType` for decimal inputs, + :class:`~pyspark.sql.types.DoubleType` for other numeric inputs, or an interval type + for interval inputs. + Examples -------- Example 1: Calculating the sum of values in a column @@ -1701,6 +1710,14 @@ def avg(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the column for computed results. + Notes + ----- + - Null values are ignored during the computation. 
+ - The return type depends on the input: + :class:`~pyspark.sql.types.DoubleType` for non-decimal numeric inputs, + :class:`~pyspark.sql.types.DecimalType` for decimal inputs, or an interval type + for interval inputs. + Examples -------- Example 1: Calculating the average age @@ -1749,6 +1766,14 @@ def mean(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` the column for computed results. + Notes + ----- + - Null values are ignored during the computation. + - The return type depends on the input: + :class:`~pyspark.sql.types.DoubleType` for non-decimal numeric inputs, + :class:`~pyspark.sql.types.DecimalType` for decimal inputs, or an interval type + for interval inputs. + Examples -------- Example 1: Calculating the average age @@ -4250,6 +4275,12 @@ def stddev(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` standard deviation of given column. + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> import pyspark.sql.functions as sf @@ -4289,6 +4320,12 @@ def std(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.skewness` :meth:`pyspark.sql.functions.kurtosis` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> import pyspark.sql.functions as sf @@ -4330,6 +4367,12 @@ def stddev_samp(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.stddev_pop` :meth:`pyspark.sql.functions.var_samp` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. 
+ Examples -------- >>> import pyspark.sql.functions as sf @@ -4371,6 +4414,12 @@ def stddev_pop(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.stddev_samp` :meth:`pyspark.sql.functions.var_pop` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> import pyspark.sql.functions as sf @@ -4412,6 +4461,12 @@ def variance(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.skewness` :meth:`pyspark.sql.functions.kurtosis` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -4453,6 +4508,12 @@ def var_samp(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.var_pop` :meth:`pyspark.sql.functions.std_samp` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -4493,6 +4554,12 @@ def var_pop(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.var_samp` :meth:`pyspark.sql.functions.std_pop` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -5777,6 +5844,12 @@ def skewness(col: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` skewness of given column. + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. 
+ Examples -------- >>> from pyspark.sql import functions as sf @@ -5818,6 +5891,12 @@ def kurtosis(col: "ColumnOrName") -> Column: :meth:`pyspark.sql.functions.variance` :meth:`pyspark.sql.functions.skewness` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -6751,6 +6830,12 @@ def corr(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: :class:`~pyspark.sql.Column` Pearson Correlation Coefficient of these two column values. + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -6793,6 +6878,12 @@ def covar_pop(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: -------- :meth:`pyspark.sql.functions.covar_samp` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf @@ -6835,6 +6926,12 @@ def covar_samp(col1: "ColumnOrName", col2: "ColumnOrName") -> Column: -------- :meth:`pyspark.sql.functions.covar_pop` + Notes + ----- + - Null values are ignored during the computation. + - The result is always a :class:`~pyspark.sql.types.DoubleType` column, + regardless of the input column type. + Examples -------- >>> from pyspark.sql import functions as sf