Merge pull request #45 from histogrammar/fix_null_values_pandas

mbaak · web-flow · commit 75eeaf2c74eb · 2021-04-05T10:15:31.000+02:00
Improve null handling in pandas dataframes
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -2,6 +2,12 @@
 Release notes
 =============
 
+Version 1.0.25, Apr 2021
+------------------------
+* Improve null handling in pandas dataframes, by inferring datatype using pandas' infer_dtype function.
+* nans in bool columns get converted to "NaN", so the column keeps True and False values in Categorize.
+* columns of type object get converted to strings using to_str(); columns of type string use only_str().
+
 Version 1.0.24, Apr 2021
 ------------------------
 * Categorize histogram now handles nones and nans in friendlier way, they are converted to "NaN".
diff --git a/README.rst b/README.rst
@@ -19,7 +19,7 @@ Histograms and other aggregators may also be converted into CUDA code for inclus
 PyCUDA is available, they can also be filled from Numpy arrays by JIT-compiling the CUDA code.
 This Python implementation of histogrammar been tested to guarantee compatibility with its Scala implementation.
 
-Latest Python release: v1.0.24 (April 2021).
+Latest Python release: v1.0.25 (April 2021).
 
 Announcements
 =============
diff --git a/histogrammar/dfinterface/filling_utils.py b/histogrammar/dfinterface/filling_utils.py
@@ -49,8 +49,9 @@ def check_dtype(dtype):
             # this converts pandas types, such as pd.Int64, into numpy types
             dtype = type(dtype.type())
         dtype = np.dtype(dtype).type
-        if dtype in {np.str_, np.string_, np.object_}:
+        if dtype in {np.str_, np.string_}:
             dtype = np.dtype(str).type
+        # MB 20210404: np.object_ is kept as an object -> uses to_str(); str uses only_str()
     except BaseException:
         raise RuntimeError(f'unknown assigned datatype "{dtype}"')
     return dtype
@@ -95,11 +96,9 @@ def to_str(val):
                 )
             )
         )
-
     elif hasattr(val, "__str__"):
         return str(val)
-
-    return ""
+    return "None"
 
 
 def only_str(val):
@@ -127,9 +126,9 @@ def only_bool(val):
         return val
     elif hasattr(val, "__iter__") and not isinstance(val, str):
         return np.asarray(
-            [s if isinstance(s, (np.bool_, bool)) else np.nan for s in val]
+            [s if isinstance(s, (np.bool_, bool)) else "NaN" for s in val]
         )
-    return np.nan
+    return "NaN"
 
 
 def only_int(val):
@@ -165,6 +164,9 @@ def only_float(val):
 
 
 QUANTITY = {
+    # MB 20210404: to_str for object types b/c they can hold mixed types
+    np.object: to_str,
+    np.object_: to_str,
     str: only_str,
     np.str_: only_str,
     int: only_int,
diff --git a/histogrammar/dfinterface/histogram_filler_base.py b/histogrammar/dfinterface/histogram_filler_base.py
@@ -405,7 +405,7 @@ def categorize_features(self, df):
         for col_list in features:
             for col in col_list:
 
-                dt = check_dtype(self.get_data_type(df, col))
+                dt = self.var_dtype.get(col, check_dtype(self.get_data_type(df, col)))
 
                 if col not in self.var_dtype:
                     self.var_dtype[col] = dt
diff --git a/histogrammar/dfinterface/pandas_histogrammar.py b/histogrammar/dfinterface/pandas_histogrammar.py
@@ -11,9 +11,11 @@
 
 import histogrammar as hg
 import joblib
+import numpy as np
 import pandas as pd
 from joblib import Parallel, delayed
 from tqdm import tqdm
+from pandas.api.types import infer_dtype
 
 from .filling_utils import to_ns, QUANTITY
 from .histogram_filler_base import HistogramFillerBase
@@ -122,6 +124,31 @@ def get_features(self, df):
         """
         return df.columns.tolist()
 
+    def get_data_type(self, df, col):
+        """Infer the data type of a dataframe column from its non-null values.
+
+        :param df: input data frame
+        :param col: column label
+        :returns: 'str', 'int', 'bool', 'float', 'datetime64' or np.object_
+        :raises KeyError: if the column is not in the data frame
+        """
+        if col not in df.columns:
+            raise KeyError(f'column "{col}" not in input dataframe')
+
+        # skipna=True: null values are ignored when inferring the type
+        inferred = infer_dtype(df[col], skipna=True)
+        if inferred == 'string':
+            return 'str'
+        if inferred == 'integer':
+            return 'int'
+        if inferred == 'boolean':
+            return 'bool'
+        if inferred in {'decimal', 'floating', 'mixed-integer-float'}:
+            return 'float'
+        if inferred in {'date', 'datetime', 'datetime64'}:
+            return 'datetime64'
+        return np.object_  # categorical, mixed, etc -> object, handled by to_str()
+
     def get_quantiles(self, df, quantiles=[0.05, 0.95], columns=[]):
         """return dict with quantiles for given columns
 
diff --git a/setup.py b/setup.py
@@ -22,7 +22,7 @@
 
 MAJOR = 1
 REVISION = 0
-PATCH = 24
+PATCH = 25
 DEV = False
 # NOTE: also update version at: README.rst
 
diff --git a/tests/test_pandas_histogrammar.py b/tests/test_pandas_histogrammar.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 import numpy as np
+import pandas as pd
 import pytest
 
 from histogrammar.dfinterface.pandas_histogrammar import PandasHistogrammar
@@ -227,3 +228,56 @@ def test_get_histograms_module():
 def test_get_time_axes():
     time_axes = get_time_axes(pytest.test_df)
     np.testing.assert_array_equal(time_axes, ["date"])
+
+
+def test_null_histograms():
+    """Check that nulls in pandas columns of various dtypes land in the expected bins."""
+    d = {'transaction': {0: np.nan, 1: 1.0, 2: np.nan, 3: 3.0, 4: 4.0},
+         'isActive': {0: None, 1: None, 2: True, 3: True, 4: False},
+         'eyeColor': {0: None, 1: None, 2: 'Jones', 3: 'USA', 4: 'FL'},
+         't2': {0: np.nan, 1: 2.0, 2: np.nan, 3: 4.0, 4: 5.0},
+         'foo': {0: np.nan, 1: np.nan, 2: np.nan, 3: True, 4: False},
+         'bar': {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'},
+         'bla': {0: 1, 1: 2, 2: 3, 3: 4, 4: np.nan},
+         'mixed': {0: 'a', 1: 'b', 2: 'c', 3: np.nan, 4: 1}}
+    df = pd.DataFrame(d)
+    df['bar'] = df['bar'].astype('category')
+
+    hists = make_histograms(df, bin_specs={'transaction': {'num': 40, 'low': 0, 'high': 10}})
+
+    assert 'transaction' in hists
+    assert 'isActive' in hists
+    assert 'eyeColor' in hists
+    assert 't2' in hists
+    assert 'foo' in hists
+    assert 'bar' in hists
+    assert 'bla' in hists
+    assert 'mixed' in hists
+
+    h = hists['transaction']  # float column: the two nulls are counted in nanflow
+    assert h.nanflow.entries == 2
+
+    h = hists['t2']  # float column: the two nulls are counted in nanflow
+    assert h.nanflow.entries == 2
+
+    h = hists['isActive']  # bool column with None: nulls are expected in a "NaN" bin
+    assert 'NaN' in h.bins
+    assert h.bins['NaN'].entries == 2
+
+    h = hists['eyeColor']  # string column with None: nulls are expected in a "None" bin
+    assert 'None' in h.bins
+    assert h.bins['None'].entries == 2
+
+    h = hists['foo']  # bool column with np.nan: nulls are expected in a "NaN" bin
+    assert 'NaN' in h.bins
+    assert h.bins['NaN'].entries == 3
+
+    h = hists['bar']  # categorical column without nulls: no "NaN" bin expected
+    assert 'NaN' not in h.bins
+
+    h = hists['bla']  # numeric column with one np.nan: counted in nanflow
+    assert h.nanflow.entries == 1
+
+    h = hists['mixed']  # mixed str/int column: np.nan is expected in a "nan" bin
+    assert 'nan' in h.bins
+    assert h.bins['nan'].entries == 1
diff --git a/tests/test_spark_histogrammar.py b/tests/test_spark_histogrammar.py
@@ -264,8 +264,8 @@ def test_get_histograms_date(spark_co):
 def test_null_histograms(spark_co):
     spark = spark_co
 
-    data = [(None, None, None, None), (1, None, None, 2), (None, True, "Jones", None), (3, True, "USA", 4),
-            (4, False, "FL", 5)]
+    data = [(None, None, None, None), (1, None, None, 2.), (None, True, "Jones", None), (3, True, "USA", 4.),
+            (4, False, "FL", 5.)]
     columns = ["transaction", "isActive", "eyeColor", "t2"]
     sdf = spark.createDataFrame(data=data, schema=columns)