@@ -16,10 +16,10 @@ import pytest
from typing import Any, Dict, List, Optional, Tuple, Union
from absl.testing.absltest import TestCase, main
{transform.test_snowpark_pandas_imports}
- # from snowflake.ml.beta import snowpark_pandas
+ # from snowflake.ml import snowpark_pandas
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
- # from snowflake.snowpark.modin import pandas as SnowparkPandas
+ # from snowflake.snowpark.modin import pandas as snowpark_pandas

_INFERENCE = "INFERENCE"
_EXPECTED = "EXPECTED"
@@ -35,7 +35,7 @@ class DatasetType(enum.Enum):
class {transform.test_class_name}(TestCase):
    def setUp(self) -> None:
        """Creates Snowpark and Snowflake environments for testing."""
-         self._session = Session.builder.configs(SnowflakeLoginOptions("sfc")).create()
+         self._session = Session.builder.configs(SnowflakeLoginOptions()).create()

    def tearDown(self) -> None:
        self._session.close()
@@ -114,12 +114,12 @@ class {transform.test_class_name}(TestCase):
# inference_methods.remove("transform") # underlying estimators have no method 'transform'
# if Sk{transform.original_class_name}.__name__ == "LocalOutlierFactor" and not reg.novelty:
# inference_methods.remove("predict")
-
+
# for m in inference_methods:
# if callable(getattr(reg, m, None)):
# res = getattr(reg, m)(dataset)
- # TODO(hayu): Remove the output manipulation as the results should be exactly the same as sklearn.
- # if isinstance(res, SnowparkPandas.DataFrame) or isinstance(res, pd.DataFrame):
+ # # TODO(hayu): Remove the output manipulation as the results should be exactly the same as sklearn.
+ # if isinstance(res, snowpark_pandas.DataFrame) or isinstance(res, pd.DataFrame):
# arr = res.to_numpy()
# elif isinstance(res, list):
# arr = np.array(res)
@@ -128,14 +128,14 @@ class {transform.test_class_name}(TestCase):
# if arr.ndim == 2 and arr.shape[1] == 1:
# arr = arr.flatten()
# if len(arr.shape) == 3:
- # VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes)
- # when voting = "soft" and flatten_transform = False. We can't handle unflattened transforms,
- # so we ignore the flatten_transform flag and flatten the results. We also need to flatten the
- # sklearn results to compare them with the Snowflake results.
+ # # VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes)
+ # # when voting = "soft" and flatten_transform = False. We can't handle unflattened transforms,
+ # # so we ignore the flatten_transform flag and flatten the results. We also need to flatten the
+ # # sklearn results to compare them with the Snowflake results.
# arr = np.hstack(arr) # type: ignore[arg-type]
# elif len(arr.shape) == 1:
- # Sometimes sklearn returns results as a 1D array of shape (n_samples,), but snowflake always returns
- # the response as a 2D array of shape (n_samples, 1). Flatten the snowflake response to compare results.
+ # # Sometimes sklearn returns results as a 1D array of shape (n_samples,), but snowflake always returns
+ # # the response as a 2D array of shape (n_samples, 1). Flatten the snowflake response to compare results.
# arr = arr.flatten()
# output[_INFERENCE].append(arr)
@@ -152,7 +152,7 @@ class {transform.test_class_name}(TestCase):
# for m in expected_methods:
# if callable(getattr(reg, m, None)):
# res = getattr(reg, m)(dataset)
- # if isinstance(res, SnowparkPandas.DataFrame) or isinstance(res, pd.DataFrame):
+ # if isinstance(res, snowpark_pandas.DataFrame) or isinstance(res, pd.DataFrame):
# arr = res.to_numpy()
# elif isinstance(res, list):
# arr = np.array(res)
@@ -161,8 +161,8 @@ class {transform.test_class_name}(TestCase):
# if arr.ndim == 2 and arr.shape[1] == 1:
# arr = arr.flatten()
# if isinstance(arr, list):
- # For multioutput estimators, predict_proba, decision_function, etc. return a list of
- # ndarrays as output. We need to concatenate them to compare with the snowflake output.
+ # # For multioutput estimators, predict_proba, decision_function, etc. return a list of
+ # # ndarrays as output. We need to concatenate them to compare with the snowflake output.
# arr = np.concatenate(arr, axis=1)
# elif len(arr.shape) == 1:
# # Sometimes sklearn returns results as a 1D array of shape (n_samples,), but snowflake always returns
@@ -189,14 +189,18 @@ class {transform.test_class_name}(TestCase):
# reg = Sk{transform.original_class_name}({transform.test_estimator_input_args})

+ # # Special handling for the label encoder: the sklearn LabelEncoder fit method only accepts fit(y),
+ # # but our SnowML API would treat it as fit(X).
+ # _is_label_encoder = reg.__class__.__name__ == "LabelEncoder"
+
# input_df_pandas, input_cols, label_col = self._get_test_dataset(
# sklearn_obj=reg,
# add_sample_weight_col=use_weighted_dataset
# )
- # input_df_snowpark_pandas = SnowparkPandas.DataFrame(input_df_pandas)
+ # input_df_snowpandas = snow_pd.DataFrame(input_df_pandas)

# pd_X, pd_y = input_df_pandas[input_cols], input_df_pandas[label_col].squeeze()
- # snow_X, snow_y = input_df_snowpark_pandas[input_cols], input_df_snowpark_pandas[label_col].squeeze()
+ # snow_X, snow_y = input_df_snowpandas[input_cols], input_df_snowpandas[label_col].squeeze()
# pd_args = {{
# 'X': pd_X,
# 'y': pd_y,
@@ -205,21 +209,23 @@ class {transform.test_class_name}(TestCase):
# 'X': snow_X,
# 'y': snow_y,
# }}
- # if use_weighted_dataset:
+
+ # # SnowML preprocessing classes currently don't support sample weights.
+ # if use_weighted_dataset and not {transform._is_preprocessing_module_obj}:
# pd_args['sample_weight'] = input_df_pandas["SAMPLE_WEIGHT"].squeeze()
- # snow_args['sample_weight'] = input_df_snowpark_pandas["SAMPLE_WEIGHT"].squeeze()
+ # snow_args['sample_weight'] = input_df_snowpandas["SAMPLE_WEIGHT"].squeeze()

# pd_score_args = snow_score_args = None
# if callable(getattr(reg, "score", None)):
# pd_score_args = copy.deepcopy(pd_args)
# snow_score_args = copy.deepcopy(snow_args)
# score_argspec = inspect.getfullargspec(reg.score)
- # Some classes have a sample_weight argument in fit() but not in score().
+ # # Some classes have a sample_weight argument in fit() but not in score().
# if use_weighted_dataset and 'sample_weight' not in score_argspec.args:
# del pd_score_args['sample_weight']
# del snow_score_args['sample_weight']

- # Some classes use a different arg name in score: X -> X_test
+ # # Some classes use a different arg name in score: X -> X_test
# if "X_test" in score_argspec.args:
# pd_score_args['X_test'] = pd_score_args.pop('X')
# snow_score_args['X_test'] = snow_score_args.pop('X')
@@ -229,24 +235,34 @@ class {transform.test_class_name}(TestCase):
# pd_args['Y'] = pd_args.pop('y')
# snow_args['Y'] = snow_args.pop('y')

- # pandas
- # pd_output = self._compute_output(reg, pd_args, input_df_pandas[input_cols], pd_score_args)
+ # # pandas
+ # if _is_label_encoder:
+ # pd_output = self._compute_output(reg, {{'y': input_df_pandas[label_col]}}, input_df_pandas[label_col], None)
+ # else:
+ # pd_output = self._compute_output(reg, pd_args, input_df_pandas[input_cols], pd_score_args)

- # snowpark_pandas
+ # # snowpandas
# snowpark_pandas.init()

+ # # Integrate with native distributed preprocessing methods
# snow_reg = Sk{transform.original_class_name}({transform.test_estimator_input_args})
# args = snow_args if training == DatasetType.SNOWPARK_PANDAS else pd_args
# dataset, score_args = (
- # (input_df_snowpark_pandas[input_cols], snow_score_args) if inference == DatasetType.SNOWPARK_PANDAS
+ # (input_df_snowpandas[input_cols], snow_score_args) if inference == DatasetType.SNOWPARK_PANDAS
# else (input_df_pandas[input_cols], pd_score_args)
# )
- # snow_output = self._compute_output(snow_reg, args, dataset, score_args)
+ # if _is_label_encoder:
+ # if training == DatasetType.SNOWPARK_PANDAS:
+ # snow_output = self._compute_output(reg, {{'X': input_df_snowpandas[label_col]}}, input_df_snowpandas[label_col], None)
+ # else:
+ # snow_output = self._compute_output(reg, {{'y': input_df_pandas[label_col]}}, input_df_pandas[label_col], None)
+ # else:
+ # snow_output = self._compute_output(snow_reg, args, dataset, score_args)

# for pd_arr, snow_arr in zip(pd_output[_INFERENCE], snow_output[_INFERENCE]):
# snow_arr = snow_arr.astype(pd_arr.dtype) # type: ignore[union-attr]
- # TODO(snandamuri): HistGradientBoostingRegressor is returning different results in different envs.
- # Needs further debugging.
+ # # TODO(snandamuri): HistGradientBoostingRegressor is returning different results in different envs.
+ # # Needs further debugging.
# if {transform._is_hist_gradient_boosting_regressor}:
# num_diffs = (~np.isclose(snow_arr, pd_arr)).sum()
# num_example = pd_arr.shape[0]
@@ -282,13 +298,13 @@ class {transform.test_class_name}(TestCase):
# use_weighted_dataset=False
# )

-     def _is_weighted_dataset_supported(self, klass: type) -> bool:
-         is_weighted_dataset_supported = False
-         for m in inspect.getmembers(klass):
-             if inspect.isfunction(m[1]) and m[0] == "fit":
-                 argspec = inspect.getfullargspec(m[1])
-                 is_weighted_dataset_supported = True if "sample_weight" in argspec.args else False
-         return is_weighted_dataset_supported
+     # def _is_weighted_dataset_supported(self, klass: type) -> bool:
+     #     is_weighted_dataset_supported = False
+     #     for m in inspect.getmembers(klass):
+     #         if inspect.isfunction(m[1]) and m[0] == "fit":
+     #             argspec = inspect.getfullargspec(m[1])
+     #             is_weighted_dataset_supported = True if "sample_weight" in argspec.args else False
+     #     return is_weighted_dataset_supported

# def test_weighted_datasets_snow_snow(self) -> None:
# if self._is_weighted_dataset_supported(Sk{transform.original_class_name}):