Skip to content

Commit 75eeaf2

Browse files
authored
Merge pull request #45 from histogrammar/fix_null_values_pandas
Improve null handling in pandas dataframes
2 parents 2207fc9 + e417069 commit 75eeaf2

File tree

8 files changed

+100
-11
lines changed

8 files changed

+100
-11
lines changed

CHANGES.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22
Release notes
33
=============
44

5+
Version 1.0.25, Apr 2021
6+
------------------------
7+
* Improve null handling in pandas dataframes, by inferring datatype using pandas' infer_dtype function.
8+
* nans in bool columns get converted to "NaN", so the column keeps True and False values in Categorize.
9+
* Columns of type object are converted to strings using to_str(); columns of type string use only_str().
10+
511
Version 1.0.24, Apr 2021
612
------------------------
713
* Categorize histogram now handles nones and nans in friendlier way, they are converted to "NaN".

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ Histograms and other aggregators may also be converted into CUDA code for inclus
1919
PyCUDA is available, they can also be filled from Numpy arrays by JIT-compiling the CUDA code.
2020
This Python implementation of histogrammar has been tested to guarantee compatibility with its Scala implementation.
2121

22-
Latest Python release: v1.0.24 (April 2021).
22+
Latest Python release: v1.0.25 (April 2021).
2323

2424
Announcements
2525
=============

histogrammar/dfinterface/filling_utils.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,9 @@ def check_dtype(dtype):
4949
# this converts pandas types, such as pd.Int64, into numpy types
5050
dtype = type(dtype.type())
5151
dtype = np.dtype(dtype).type
52-
if dtype in {np.str_, np.string_, np.object_}:
52+
if dtype in {np.str_, np.string_}:
5353
dtype = np.dtype(str).type
54+
# MB 20210404: np.object_ is kept as an object -> uses to_str(); str uses only_str()
5455
except BaseException:
5556
raise RuntimeError(f'unknown assigned datatype "{dtype}"')
5657
return dtype
@@ -95,11 +96,9 @@ def to_str(val):
9596
)
9697
)
9798
)
98-
9999
elif hasattr(val, "__str__"):
100100
return str(val)
101-
102-
return ""
101+
return "None"
103102

104103

105104
def only_str(val):
@@ -127,9 +126,9 @@ def only_bool(val):
127126
return val
128127
elif hasattr(val, "__iter__") and not isinstance(val, str):
129128
return np.asarray(
130-
[s if isinstance(s, (np.bool_, bool)) else np.nan for s in val]
129+
[s if isinstance(s, (np.bool_, bool)) else "NaN" for s in val]
131130
)
132-
return np.nan
131+
return "NaN"
133132

134133

135134
def only_int(val):
@@ -165,6 +164,9 @@ def only_float(val):
165164

166165

167166
QUANTITY = {
167+
# MB 20210404: to_str for object types b/c it's a mixed type
168+
np.object: to_str,
169+
np.object_: to_str,
168170
str: only_str,
169171
np.str_: only_str,
170172
int: only_int,

histogrammar/dfinterface/histogram_filler_base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,7 @@ def categorize_features(self, df):
405405
for col_list in features:
406406
for col in col_list:
407407

408-
dt = check_dtype(self.get_data_type(df, col))
408+
dt = self.var_dtype.get(col, check_dtype(self.get_data_type(df, col)))
409409

410410
if col not in self.var_dtype:
411411
self.var_dtype[col] = dt

histogrammar/dfinterface/pandas_histogrammar.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@
1111

1212
import histogrammar as hg
1313
import joblib
14+
import numpy as np
1415
import pandas as pd
1516
from joblib import Parallel, delayed
1617
from tqdm import tqdm
18+
from pandas.api.types import infer_dtype
1719

1820
from .filling_utils import to_ns, QUANTITY
1921
from .histogram_filler_base import HistogramFillerBase
@@ -122,6 +124,31 @@ def get_features(self, df):
122124
"""
123125
return df.columns.tolist()
124126

127+
def get_data_type(self, df, col):
    """Get data type of dataframe column.

    The type is inferred from the column's values with pandas'
    ``infer_dtype(..., skipna=True)``, so null entries do not influence
    the result.

    :param df: input data frame
    :param col: column label
    :return: one of ``'str'``, ``'int'``, ``'bool'``, ``'float'`` or
        ``'datetime64'``; ``np.object_`` for every other inferred kind
        (categorical, mixed, etc.)
    :raises KeyError: if ``col`` is not a column of ``df``
    """
    if col not in df.columns:
        # No ":s" format spec here: it raises ValueError for non-string
        # labels (e.g. integer column names) and would mask the KeyError.
        raise KeyError(f'column "{col}" not in input dataframe')

    inferred = infer_dtype(df[col], skipna=True)

    # Exact comparison; a substring test against 'string' would also
    # accept unrelated values such as '' or 'ring'.
    if inferred == 'string':
        data_type = 'str'
    elif inferred == 'integer':
        data_type = 'int'
    elif inferred == 'boolean':
        data_type = 'bool'
    elif inferred in {'decimal', 'floating', 'mixed-integer-float'}:
        data_type = 'float'
    elif inferred in {'date', 'datetime', 'datetime64'}:
        data_type = 'datetime64'
    else:  # categorical, mixed, etc -> object uses to_str()
        data_type = np.object_

    return data_type
151+
125152
def get_quantiles(self, df, quantiles=[0.05, 0.95], columns=[]):
126153
"""return dict with quantiles for given columns
127154

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323
MAJOR = 1
2424
REVISION = 0
25-
PATCH = 24
25+
PATCH = 25
2626
DEV = False
2727
# NOTE: also update version at: README.rst
2828

tests/test_pandas_histogrammar.py

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#!/usr/bin/env python3
22

33
import numpy as np
4+
import pandas as pd
45
import pytest
56

67
from histogrammar.dfinterface.pandas_histogrammar import PandasHistogrammar
@@ -227,3 +228,56 @@ def test_get_histograms_module():
227228
def test_get_time_axes():
228229
time_axes = get_time_axes(pytest.test_df)
229230
np.testing.assert_array_equal(time_axes, ["date"])
231+
232+
233+
def test_null_histograms():
    """Check how nulls in pandas columns of several types end up in histograms."""
    d = {'transaction': {0: np.nan, 1: 1.0, 2: np.nan, 3: 3.0, 4: 4.0},
         'isActive': {0: None, 1: None, 2: True, 3: True, 4: False},
         'eyeColor': {0: None, 1: None, 2: 'Jones', 3: 'USA', 4: 'FL'},
         't2': {0: np.nan, 1: 2.0, 2: np.nan, 3: 4.0, 4: 5.0},
         'foo': {0: np.nan, 1: np.nan, 2: np.nan, 3: True, 4: False},
         'bar': {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'},
         'bla': {0: 1, 1: 2, 2: 3, 3: 4, 4: np.nan},
         'mixed': {0: 'a', 1: 'b', 2: 'c', 3: np.nan, 4: 1}}
    df = pd.DataFrame(d)
    df['bar'] = df['bar'].astype('category')

    hists = make_histograms(df, bin_specs={'transaction': {'num': 40, 'low': 0, 'high': 10}})

    # every input column must have produced a histogram
    for col in d:
        assert col in hists

    # numeric columns: nans are counted in the nanflow bin
    h = hists['transaction']
    assert h.nanflow.entries == 2

    h = hists['t2']
    assert h.nanflow.entries == 2

    # boolean columns: nulls are counted under the "NaN" key
    h = hists['isActive']
    assert 'NaN' in h.bins
    assert h.bins['NaN'].entries == 2

    # string column: nulls are counted under the "None" key
    h = hists['eyeColor']
    assert 'None' in h.bins
    assert h.bins['None'].entries == 2

    h = hists['foo']
    assert 'NaN' in h.bins
    assert h.bins['NaN'].entries == 3

    # categorical column without nulls: no "NaN" key expected
    h = hists['bar']
    assert 'NaN' not in h.bins

    h = hists['bla']
    assert h.nanflow.entries == 1

    # mixed-type column: the nan is counted under the "nan" key
    h = hists['mixed']
    assert 'nan' in h.bins
    assert h.bins['nan'].entries == 1

tests/test_spark_histogrammar.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -264,8 +264,8 @@ def test_get_histograms_date(spark_co):
264264
def test_null_histograms(spark_co):
265265
spark = spark_co
266266

267-
data = [(None, None, None, None), (1, None, None, 2), (None, True, "Jones", None), (3, True, "USA", 4),
268-
(4, False, "FL", 5)]
267+
data = [(None, None, None, None), (1, None, None, 2.), (None, True, "Jones", None), (3, True, "USA", 4.),
268+
(4, False, "FL", 5.)]
269269
columns = ["transaction", "isActive", "eyeColor", "t2"]
270270
sdf = spark.createDataFrame(data=data, schema=columns)
271271

0 commit comments

Comments
 (0)