Merge pull request #63 from histogrammar/decimal-support

mbaak · web-flow · commit 217bb5949fdf · 2022-09-08T19:05:06.000+02:00
Decimal support
diff --git a/histogrammar/dfinterface/filling_utils.py b/histogrammar/dfinterface/filling_utils.py
@@ -38,7 +38,7 @@ def check_column(col, sep=":"):
     return col
 
 
-def check_dtype(dtype):
+def normalize_dtype(dtype):
     """Convert datatype to consistent numpy datatype
 
     :param dtype: input datatype
diff --git a/histogrammar/dfinterface/histogram_filler_base.py b/histogrammar/dfinterface/histogram_filler_base.py
@@ -27,7 +27,7 @@
 from ..primitives.stack import Stack
 from ..primitives.sum import Sum
 
-from .filling_utils import check_column, check_dtype
+from .filling_utils import check_column, normalize_dtype
 
 
 class HistogramFillerBase(object):
@@ -111,7 +111,7 @@ def __init__(
         self.bin_specs = bin_specs or {}
         self.time_axis = time_axis
         var_dtype = var_dtype or {}
-        self.var_dtype = {k: check_dtype(v) for k, v in var_dtype.items()}
+        self.var_dtype = {k: normalize_dtype(v) for k, v in var_dtype.items()}
         self.read_key = read_key
         self.store_key = store_key
 
@@ -404,32 +404,31 @@ def categorize_features(self, df):
 
         for col_list in features:
             for col in col_list:
+                # data type with metadata
+                dt_col = self.get_data_type(df, col)
 
-                dt = self.var_dtype.get(col, check_dtype(self.get_data_type(df, col)))
+                # normalized data type
+                dt = self.var_dtype.get(col, normalize_dtype(dt_col))
 
                 if col not in self.var_dtype:
                     self.var_dtype[col] = dt
 
+                # metadata indicates decimal
+                if hasattr(dt_col, 'metadata') and dt_col.metadata is not None and dt_col.metadata["decimal"]:
+                    cols_by_type["decimal"].add(col)
+
                 if np.issubdtype(dt, np.integer):
-                    colset = cols_by_type["int"]
-                    if col not in colset:
-                        colset.add(col)
+                    cols_by_type["int"].add(col)
+
                 if np.issubdtype(dt, np.number):
                     colset = cols_by_type["num"]
-                    if col not in colset:
-                        colset.add(col)
                 elif np.issubdtype(dt, np.datetime64):
                     colset = cols_by_type["dt"]
-                    if col not in colset:
-                        colset.add(col)
                 elif np.issubdtype(dt, np.bool_):
                     colset = cols_by_type["bool"]
-                    if col not in colset:
-                        colset.add(col)
                 else:
                     colset = cols_by_type["str"]
-                    if col not in colset:
-                        colset.add(col)
+                colset.add(col)
 
                 self.logger.debug(
                     'Data type of column "{col}" is "{type}".'.format(
diff --git a/histogrammar/dfinterface/make_histograms.py b/histogrammar/dfinterface/make_histograms.py
@@ -42,7 +42,7 @@
 
 from .pandas_histogrammar import PandasHistogrammar
 from .spark_histogrammar import SparkHistogrammar
-from .filling_utils import check_dtype
+from .filling_utils import normalize_dtype
 from ..util import _get_sub_hist
 
 logger = logging.getLogger()
@@ -232,7 +232,7 @@ def get_time_axes(df):
     return [
         c
         for c in df.columns
-        if np.issubdtype(check_dtype(get_data_type(df, c)), np.datetime64)
+        if np.issubdtype(normalize_dtype(get_data_type(df, c)), np.datetime64)
     ]
 
 
diff --git a/histogrammar/dfinterface/pandas_histogrammar.py b/histogrammar/dfinterface/pandas_histogrammar.py
@@ -136,7 +136,11 @@ def get_data_type(self, df, col):
         elif inferred == 'boolean':
             data_type = 'bool'
         elif inferred in {'decimal', 'floating', 'mixed-integer-float'}:
-            data_type = 'float'
+            # Decimal values must be cast to float before filling; flag this in the dtype metadata
+            if inferred == "decimal":
+                data_type = np.dtype('float', metadata={"decimal": True})
+            else:
+                data_type = "float"
         elif inferred in {'date', 'datetime', 'datetime64'}:
             data_type = 'datetime64'
         else:  # categorical, mixed, etc -> object uses to_string()
@@ -187,6 +191,12 @@ def process_features(self, df, cols_by_type):
                 )
             )
             idf[col] = df[col].apply(to_ns)
+
+        # cast Decimal columns to float, because pandas' .quantile does not support Decimal objects
+        # (see https://github.com/pandas-dev/pandas/issues/13157)
+        for col in cols_by_type["decimal"]:
+            idf[col] = df[col].apply(float)
+
         return idf
 
     def fill_histograms(self, idf):
diff --git a/histogrammar/dfinterface/spark_histogrammar.py b/histogrammar/dfinterface/spark_histogrammar.py
@@ -169,6 +169,8 @@ def get_data_type(self, df, col):
             dt = bool
         elif dt == "bigint":
             dt = np.int64
+        elif dt.startswith("decimal("):
+            return np.dtype(float, metadata={"decimal": True})
 
         return np.dtype(dt)
 
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,3 +1,4 @@
+from decimal import Decimal
 from json import load
 from os.path import dirname
 
@@ -88,4 +89,8 @@ def pytest_configure():
 
     df = pd.read_csv(resources.data(CSV_FILE))
     df["date"] = pd.to_datetime(df["date"])
+
+    # Decimal type
+    df["amount"] = df["balance"].str.replace("$", "", regex=False).str.replace(",", "", regex=False).apply(Decimal)
+
     pytest.test_df = df
diff --git a/tests/test_pandas_histogrammar.py b/tests/test_pandas_histogrammar.py
@@ -13,7 +13,6 @@
 
 
 def test_get_histograms():
-
     pandas_filler = PandasHistogrammar(
         features=[
             "date",
@@ -47,7 +46,6 @@ def test_get_histograms():
 
 
 def test_make_histograms():
-
     features = [
         "date",
         "isActive",
@@ -85,15 +83,14 @@ def test_make_histograms():
 
 
 def test_make_histograms_no_time_axis():
-
     hists, features, bin_specs, time_axis, var_dtype = make_histograms(
         pytest.test_df, time_axis="", ret_specs=True,
     )
 
-    assert len(hists) == 21
-    assert len(features) == 21
-    assert len(bin_specs) == 6
-    assert len(var_dtype) == 21
+    assert len(hists) == 22
+    assert len(features) == 22
+    assert len(bin_specs) == 7
+    assert len(var_dtype) == 22
     assert time_axis == ""
     assert "date" in hists
     h = hists["date"]
@@ -110,15 +107,14 @@ def test_make_histograms_no_time_axis():
 
 
 def test_make_histograms_with_time_axis():
-
     hists, features, bin_specs, time_axis, var_dtype = make_histograms(
         pytest.test_df, time_axis=True, ret_specs=True, time_width=None, time_offset=None
     )
 
-    assert len(hists) == 20
-    assert len(features) == 20
-    assert len(bin_specs) == 20
-    assert len(var_dtype) == 21
+    assert len(hists) == 21
+    assert len(features) == 21
+    assert len(bin_specs) == 21
+    assert len(var_dtype) == 22
     assert time_axis == "date"
     assert "date:age" in hists
     h = hists["date:age"]
@@ -167,10 +163,10 @@ def test_make_histograms_unit_binning():
         pytest.test_df, binning="unit", time_axis="", ret_specs=True
     )
 
-    assert len(hists) == 21
-    assert len(features) == 21
+    assert len(hists) == 22
+    assert len(features) == 22
     assert len(bin_specs) == 0
-    assert len(var_dtype) == 21
+    assert len(var_dtype) == 22
     assert time_axis == ""
     assert "date" in hists
     h = hists["date"]
diff --git a/tests/test_spark_histogrammar.py b/tests/test_spark_histogrammar.py
@@ -30,7 +30,6 @@ def get_spark():
         SparkSession.builder.master("local")
         .appName("histogrammar-pytest")
         .config("spark.jars", f"{hist_spark_jar},{hist_jar}")
-        .config("spark.sql.execution.arrow.enabled", "false")
         .config("spark.sql.session.timeZone", "GMT")
         .getOrCreate()
     )
@@ -81,6 +80,7 @@ def test_get_histograms(spark_co):
             ["isActive", "age"],
             ["latitude", "longitude"],
             "transaction",
+            "amount",
         ],
         bin_specs={
             "transaction": {"num": 100, "low": -2000, "high": 2000},
@@ -140,6 +140,7 @@ def test_get_histograms_module(spark_co):
             "longitude",
             ["isActive", "age"],
             ["latitude", "longitude"],
+            "amount",
         ],
         bin_specs={
             "longitude": {"bin_width": 5.0, "bin_offset": 0.0},

Original file line number	Diff line number	Diff line change
`@@ -42,7 +42,7 @@`
`42`	`42`
`43`	`43`	`from .pandas_histogrammar import PandasHistogrammar`
`44`	`44`	`from .spark_histogrammar import SparkHistogrammar`
`45`		`-from .filling_utils import check_dtype`
	`45`	`+from .filling_utils import normalize_dtype`
`46`	`46`	`from ..util import _get_sub_hist`
`47`	`47`
`48`	`48`	`logger = logging.getLogger()`
`@@ -232,7 +232,7 @@ def get_time_axes(df):`
`232`	`232`	`return [`
`233`	`233`	`c`
`234`	`234`	`for c in df.columns`
`235`		`- if np.issubdtype(check_dtype(get_data_type(df, c)), np.datetime64)`
	`235`	`+ if np.issubdtype(normalize_dtype(get_data_type(df, c)), np.datetime64)`
`236`	`236`	`]`
`237`	`237`
`238`	`238`