-
Notifications
You must be signed in to change notification settings - Fork 38
Simple Impute for timeseries #975
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
61d361b
73aa16a
a5d9b27
ec4f791
1b1abc2
e712a46
3bfe0ce
058a177
6c23648
810f68a
bf207a5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -92,7 +92,9 @@ def _is_val_missing(data: np.ndarray) -> np.ndarray[Any, np.dtype[np.bool_]]: | |
| raise AssertionError("Values outside imputed columns were modified.") | ||
|
|
||
| # Ensure imputation does not alter non-NaN values in the imputed columns | ||
| imputed_non_nan_mask = (~before_nan_mask) & imputed_mask | ||
| imputed_non_nan_mask = (~before_nan_mask) & ( | ||
| imputed_mask[None, :] if layer_before.ndim == 2 else imputed_mask[None, :, None] | ||
| ) | ||
| if not _are_ndarrays_equal(layer_before[imputed_non_nan_mask], layer_after[imputed_non_nan_mask]): | ||
| raise AssertionError("Non-NaN values in imputed columns were modified.") | ||
|
|
||
|
|
@@ -147,10 +149,34 @@ def test_base_check_imputation_change_detected_in_imputed_column(impute_num_edat | |
| _base_check_imputation(impute_num_edata, edata_imputed) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize( | ||
| "array_type,expected_error", | ||
| [ | ||
| (np.array, None), | ||
| (da.array, None), | ||
| (sparse.csr_matrix, NotImplementedError), | ||
eroell marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| ], | ||
| ) | ||
| def test_simple_impute_array_types(impute_num_edata, array_type, expected_error): | ||
eroell marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| impute_num_edata.X = array_type(impute_num_edata.X) | ||
|
|
||
| if expected_error: | ||
| with pytest.raises(expected_error): | ||
| simple_impute(impute_num_edata, strategy="mean") | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("array_type", ARRAY_TYPES) | ||
| @pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) | ||
| def test_simple_impute_basic(impute_num_edata, strategy): | ||
| edata_imputed = simple_impute(impute_num_edata, strategy=strategy, copy=True) | ||
| _base_check_imputation(impute_num_edata, edata_imputed) | ||
| def test_simple_impute_basic(impute_num_edata, array_type, strategy): | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Explanation: This test and the following one are specific to the function under test, which here is about imputation. |
||
| impute_num_edata.X = array_type(impute_num_edata.X) | ||
|
|
||
| if isinstance(impute_num_edata.X, da.Array) and strategy != "mean": | ||
| with pytest.raises(ValueError): | ||
| edata_imputed = simple_impute(impute_num_edata, strategy=strategy, copy=True) | ||
|
|
||
| else: | ||
| edata_imputed = simple_impute(impute_num_edata, strategy=strategy, copy=True) | ||
| _base_check_imputation(impute_num_edata, edata_imputed) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) | ||
|
|
@@ -161,19 +187,55 @@ def test_simple_impute_copy(impute_num_edata, strategy): | |
| _base_check_imputation(impute_num_edata, edata_imputed) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("array_type", ARRAY_TYPES) | ||
| @pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) | ||
| def test_simple_impute_subset(impute_edata, strategy): | ||
| def test_simple_impute_subset(impute_edata, array_type, strategy): | ||
| impute_edata.X = array_type(impute_edata.X) | ||
| var_names = ("intcol", "indexcol") | ||
| edata_imputed = simple_impute(impute_edata, var_names=var_names, copy=True) | ||
| if isinstance(impute_edata.X, da.Array) and strategy != "mean": | ||
| with pytest.raises(ValueError): | ||
| edata_imputed = simple_impute(impute_edata, var_names=var_names, strategy=strategy, copy=True) | ||
agerardy marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| else: | ||
| edata_imputed = simple_impute(impute_edata, var_names=var_names, strategy=strategy, copy=True) | ||
|
|
||
| _base_check_imputation(impute_edata, edata_imputed, imputed_var_names=var_names) | ||
| assert np.any([item != item for item in edata_imputed.X[::, 3:4]]) | ||
| _base_check_imputation(impute_edata, edata_imputed, imputed_var_names=var_names) | ||
| assert np.any([item != item for item in edata_imputed.X[::, 3:4]]) | ||
|
|
||
| # manually verified computation result | ||
| if strategy == "mean": | ||
| assert edata_imputed.X[0, 1] == 3.0 | ||
| elif strategy == "most_frequent": | ||
| assert edata_imputed.X[0, 1] == 2.0 # if multiple equally frequent values, return minimum | ||
|
|
||
| def test_simple_impute_3D_edata(edata_blob_small): | ||
| simple_impute(edata_blob_small, layer="layer_2") | ||
| with pytest.raises(ValueError, match=r"only supports 2D data"): | ||
| simple_impute(edata_blob_small, layer=DEFAULT_TEM_LAYER_NAME) | ||
|
|
||
| @pytest.mark.parametrize("array_type", ARRAY_TYPES) | ||
| @pytest.mark.parametrize("strategy", ["mean", "median", "most_frequent"]) | ||
| def test_simple_impute_3D_edata(mcar_edata, array_type, strategy): | ||
|
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Explanation: This test checks that, for numeric data, the method can handle 3D data. There might be combinations of arguments that are not supported, which should be mentioned in the documentation if not obvious. |
||
| mcar_edata.layers[DEFAULT_TEM_LAYER_NAME] = array_type(mcar_edata.layers[DEFAULT_TEM_LAYER_NAME]) | ||
|
|
||
| if isinstance(mcar_edata.layers[DEFAULT_TEM_LAYER_NAME], da.Array) and strategy != "mean": | ||
| with pytest.raises(ValueError): | ||
| edata_imputed = simple_impute(mcar_edata, layer=DEFAULT_TEM_LAYER_NAME, strategy=strategy, copy=True) | ||
|
|
||
| else: | ||
| edata_imputed = simple_impute(mcar_edata, layer=DEFAULT_TEM_LAYER_NAME, strategy=strategy, copy=True) | ||
| _base_check_imputation( | ||
| mcar_edata, | ||
| edata_imputed, | ||
| before_imputation_layer=DEFAULT_TEM_LAYER_NAME, | ||
| after_imputation_layer=DEFAULT_TEM_LAYER_NAME, | ||
| ) | ||
|
|
||
| # manually verify computation result for 1 value | ||
| if strategy in {"mean", "median"}: | ||
| element = edata_imputed[9, 0, 0].layers[DEFAULT_TEM_LAYER_NAME] | ||
|
|
||
| if strategy == "mean": | ||
| reference_value = np.nanmean(mcar_edata[:, 0, :].layers[DEFAULT_TEM_LAYER_NAME]) | ||
| elif strategy == "median": | ||
| reference_value = np.nanmedian(mcar_edata[:, 0, :].layers[DEFAULT_TEM_LAYER_NAME]) | ||
|
|
||
| assert np.isclose(element, reference_value) | ||
|
|
||
|
|
||
| @pytest.mark.parametrize("strategy", ["mean", "median"]) | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.