Commit 6f23e59

Project import generated by Copybara. (#42)
1 parent f0326eb commit 6f23e59

16 files changed: 109 additions, 97 deletions


CHANGELOG.md

Lines changed: 13 additions & 1 deletion
@@ -1,6 +1,18 @@
 # Release History
 
-## 1.0.6
+## 1.0.7
+
+### Behavior Changes
+
+
+### New Features
+
+
+### Bug Fixes
+
+- Model Development & Model Registry: Fix an error related to `pandas.io.json.json_normalize`.
+
+## 1.0.6 (2023-09-01)
 
 ### New Features
 - Model Registry: add `create_if_not_exists` parameter in constructor.
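
The bug fix above is driven by a pandas API rename: `pandas.io.json.json_normalize` was deprecated in pandas 1.0 in favor of the top-level `pandas.json_normalize`, and the old path no longer works on recent pandas. A minimal before/after sketch (the sample records are illustrative, not from this repo):

    import pandas as pd

    # Deprecated spelling, removed in recent pandas:
    #   input_df = pd.io.json.json_normalize(records)
    # Supported spelling, which this commit switches to:
    records = [{"CITY": "Berlin", "TEMP": 21.5}, {"CITY": "Oslo", "TEMP": 14.0}]
    input_df = pd.json_normalize(records)
    print(input_df.columns.tolist())  # ['CITY', 'TEMP']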

bazel/environments/conda-env-snowflake.yml

Lines changed: 2 additions & 2 deletions
@@ -16,7 +16,7 @@ dependencies:
   - cryptography==39.0.1
   - flask-cors==3.0.10
   - flask==2.1.3
-  - fsspec==2022.11.0
+  - fsspec==2023.3.0
   - httpx==0.23.0
   - inflection==0.5.1
   - joblib==1.1.1
@@ -37,7 +37,7 @@ dependencies:
   - pyyaml==6.0
   - requests==2.29.0
   - ruamel.yaml==0.17.21
-  - s3fs==2022.11.0
+  - s3fs==2023.3.0
   - scikit-learn==1.3.0
   - scipy==1.9.3
   - snowflake-connector-python==3.0.3

bazel/environments/conda-env.yml

Lines changed: 2 additions & 2 deletions
@@ -19,7 +19,7 @@ dependencies:
   - cryptography==39.0.1
   - flask-cors==3.0.10
   - flask==2.1.3
-  - fsspec==2022.11.0
+  - fsspec==2023.3.0
   - httpx==0.23.0
   - inflection==0.5.1
   - joblib==1.1.1
@@ -41,7 +41,7 @@ dependencies:
   - pyyaml==6.0
   - requests==2.29.0
   - ruamel.yaml==0.17.21
-  - s3fs==2022.11.0
+  - s3fs==2023.3.0
   - scikit-learn==1.3.0
   - scipy==1.9.3
   - snowflake-connector-python==3.0.3

ci/conda_recipe/meta.yaml

Lines changed: 4 additions & 3 deletions
@@ -17,7 +17,7 @@ build:
   noarch: python
 package:
   name: snowflake-ml-python
-  version: 1.0.6
+  version: 1.0.7
 requirements:
   build:
     - python
@@ -27,13 +27,14 @@ requirements:
     - aiohttp!=4.0.0a0, !=4.0.0a1
     - anyio>=3.5.0,<4
    - cloudpickle
-    - fsspec>=2022.11,<=2023.1
+    - fsspec>=2022.11,<2024
     - numpy>=1.23,<2
     - packaging>=20.9,<24
     - pandas>=1.0.0,<2
-    - python
+    - python>=3.8.13, <3.11
     - pyyaml>=6.0,<7
     - requests
+    - s3fs>=2022.11,<2024
     - scikit-learn>=1.2.1,<1.4
     - scipy>=1.9,<2
     - snowflake-connector-python>=3.0.3,<4
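
The recipe widens the `fsspec` upper bound from `<=2023.1` to `<2024`, adds a matching `s3fs` constraint, and pins `python` to `>=3.8.13, <3.11`. To sanity-check what a widened specifier admits, here is a sketch using the `packaging` library (versions chosen to match the dev pins in this commit):

    from packaging.specifiers import SpecifierSet

    old_pin = SpecifierSet(">=2022.11,<=2023.1")
    new_pin = SpecifierSet(">=2022.11,<2024")

    # The new dev version 2023.3.0 is rejected by the old upper bound
    # but admitted by the widened one.
    print("2023.3.0" in old_pin)  # False
    print("2023.3.0" in new_pin)  # True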

codegen/sklearn_wrapper_template.py_template

Lines changed: 2 additions & 2 deletions
@@ -476,9 +476,9 @@ class {transform.original_class_name}(BaseTransformer):
             import pandas as pd
             import numpy as np
 
-            input_df = pd.io.json.json_normalize(ds)
+            input_df = pd.json_normalize(ds)
 
-            # pd.io.json.json_normalize() doesn't remove quotes around quoted identifiers like snowpakr_df.to_pandas().
+            # pd.json_normalize() doesn't remove quotes around quoted identifiers like snowpakr_df.to_pandas().
             # But trained models have unquoted input column names saved in internal state if trained using snowpark_df
             # or quoted input column names saved in internal state if trained using pandas_df.
             # Model expects exact same columns names in the input df for predict call.
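
The surviving comment concerns identifier quoting: `pd.json_normalize()` keeps dict keys verbatim, so Snowflake quoted identifiers come through with their double quotes intact, whereas `snowpark_df.to_pandas()` strips them. A small illustration with hypothetical column names:

    import pandas as pd

    # Keys from a vectorized UDF keep Snowflake's quoting verbatim,
    # so the normalized frame's columns include the double quotes.
    rows = [{'"FEATURE_1"': 1.0, '"FEATURE_2"': 2.0}]
    df = pd.json_normalize(rows)
    print(df.columns.tolist())  # ['"FEATURE_1"', '"FEATURE_2"']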

requirements.yml

Lines changed: 5 additions & 4 deletions
@@ -101,8 +101,8 @@
   dev_version: "2.1.3"
 - name_pypi: fsspec[http]
   name_conda: fsspec
-  dev_version: "2022.11.0"
-  version_requirements: ">=2022.11,<=2023.1"
+  dev_version: "2023.3.0"
+  version_requirements: ">=2022.11,<2024"
 - name: httpx
   dev_version: "0.23.0"
 - name: inflection
@@ -158,7 +158,7 @@
   dev_version: "7.1.2"
 - name_conda: python
   dev_version_conda: "3.8.13"
-  version_requirements_conda: ""
+  version_requirements_conda: ">=3.8.13, <3.11"
 - name_pypi: torch
   name_conda: pytorch
   dev_version: "2.0.1"
@@ -175,7 +175,8 @@
 - name: ruamel.yaml
   dev_version: "0.17.21"
 - name: s3fs
-  dev_version: "2022.11.0"
+  dev_version: "2023.3.0"
+  version_requirements: ">=2022.11,<2024"
 - name: scikit-learn
   dev_version: "1.3.0"
   version_requirements: ">=1.2.1,<1.4"

snowflake/ml/fileset/stage_fs_test.py

Lines changed: 1 addition & 5 deletions
@@ -2,11 +2,7 @@
 from typing import Dict, List
 
 import boto3
-
-# library `requests` has known stubs but is not installed.
-# TODO(zpeng): we may need to install as many mypy stubs as possible. However that
-# would require installing mypy when initializing the bazel conda environment.
-import requests  # type: ignore
+import requests
 import stage_fs
 from absl.testing import absltest
 from moto import server

snowflake/ml/model/_deploy_client/warehouse/infer_template.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def __exit__(self, type, value, traceback):
 # TODO(halu): Avoid per batch async detection branching.
 @vectorized(input=pd.DataFrame, max_batch_size=10)
 def infer(df):
-    input_df = pd.io.json.json_normalize(df[0]).astype(dtype=dtype_map)
+    input_df = pd.json_normalize(df[0]).astype(dtype=dtype_map)
     if inspect.iscoroutinefunction(model.{target_method}):
         predictions_df = anyio.run(model.{target_method}, input_df[input_cols])
     else:
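
The surrounding template dispatches on whether the model's target method is a coroutine function, running it through `anyio` when it is. A self-contained sketch of that branching (the two `predict` functions are stand-ins for the template's `model.{target_method}`):

    import inspect

    import anyio

    async def predict_async(x: int) -> int:
        return x * 2

    def predict_sync(x: int) -> int:
        return x * 2

    def call_model(fn, arg):
        # Coroutine functions go through anyio.run; plain functions are called directly.
        if inspect.iscoroutinefunction(fn):
            return anyio.run(fn, arg)
        return fn(arg)

    print(call_model(predict_async, 21))  # 42
    print(call_model(predict_sync, 21))   # 42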

snowflake/ml/modeling/model_selection/_internal/_grid_search_cv.py

Lines changed: 2 additions & 3 deletions
@@ -493,7 +493,6 @@ def _fit_snowpark(self, dataset: DataFrame) -> None:
         ]
         target_locations = []
         for param_chunk in param_chunks:
-
             param_chunk_dist: Any = defaultdict(set)
             for d in param_chunk:
                 for k, v in d.items():
@@ -675,9 +674,9 @@ def vec_batch_infer(ds: PandasSeries[dict]) -> PandasSeries[dict]:  # type: igno
             import numpy as np
             import pandas as pd
 
-            input_df = pd.io.json.json_normalize(ds)
+            input_df = pd.json_normalize(ds)
 
-            # pd.io.json.json_normalize() doesn't remove quotes around quoted identifiers like snowpakr_df.to_pandas().
+            # pd.json_normalize() doesn't remove quotes around quoted identifiers like snowpakr_df.to_pandas().
             # But trained models have unquoted input column names saved in internal state if trained using snowpark_df
             # or quoted input column names saved in internal state if trained using pandas_df.
             # Model expects exact same columns names in the input df for predict call.

snowflake/ml/modeling/model_selection/_internal/_randomized_search_cv.py

Lines changed: 2 additions & 3 deletions
@@ -503,7 +503,6 @@ def _fit_snowpark(self, dataset: DataFrame) -> None:
         ]
         target_locations = []
         for param_chunk in param_chunks:
-
             param_chunk_dist: Any = defaultdict(set)
             for d in param_chunk:
                 for k, v in d.items():
@@ -684,9 +683,9 @@ def vec_batch_infer(ds: PandasSeries[dict]) -> PandasSeries[dict]:  # type: igno
             import numpy as np
             import pandas as pd
 
-            input_df = pd.io.json.json_normalize(ds)
+            input_df = pd.json_normalize(ds)
 
-            # pd.io.json.json_normalize() doesn't remove quotes around quoted identifiers like snowpakr_df.to_pandas().
+            # pd.json_normalize() doesn't remove quotes around quoted identifiers like snowpakr_df.to_pandas().
             # But trained models have unquoted input column names saved in internal state if trained using snowpark_df
             # or quoted input column names saved in internal state if trained using pandas_df.
             # Model expects exact same columns names in the input df for predict call.

snowflake/ml/requirements.bzl

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@
 
 EXTRA_REQUIREMENTS={'lightgbm': ['lightgbm==3.3.5'], 'mlflow': ['mlflow>=2.1.0,<2.4'], 'tensorflow': ['tensorflow>=2.9,<3'], 'torch': ['torchdata>=0.4,<1'], 'transformers': ['transformers>=4.29.2,<5'], 'all': ['lightgbm==3.3.5', 'mlflow>=2.1.0,<2.4', 'tensorflow>=2.9,<3', 'torchdata>=0.4,<1', 'transformers>=4.29.2,<5']}
 
-REQUIREMENTS=['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'cloudpickle', 'fsspec[http]>=2022.11,<=2023.1', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 'scikit-learn>=1.2.1,<1.4', 'scipy>=1.9,<2', 'snowflake-connector-python[pandas]>=3.0.3,<4', 'snowflake-snowpark-python>=1.5.1,<2', 'sqlparse>=0.4,<1', 'typing-extensions>=4.1.0,<5', 'xgboost>=1.7.3,<2']
+REQUIREMENTS=['absl-py>=0.15,<2', 'anyio>=3.5.0,<4', 'cloudpickle', 'fsspec[http]>=2022.11,<2024', 'numpy>=1.23,<2', 'packaging>=20.9,<24', 'pandas>=1.0.0,<2', 'pyyaml>=6.0,<7', 's3fs>=2022.11,<2024', 'scikit-learn>=1.2.1,<1.4', 'scipy>=1.9,<2', 'snowflake-connector-python[pandas]>=3.0.3,<4', 'snowflake-snowpark-python>=1.5.1,<2', 'sqlparse>=0.4,<1', 'typing-extensions>=4.1.0,<5', 'xgboost>=1.7.3,<2']

snowflake/ml/version.bzl

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 # This is parsed by regex in conda reciper meta file. Make sure not to break it.
-VERSION = "1.0.6"
+VERSION = "1.0.7"

tests/integ/snowflake/ml/model/warehouse_snowml_model_integ_test.py

Lines changed: 3 additions & 3 deletions
@@ -77,7 +77,7 @@ def base_test_case(
             test_released_version=test_released_version,
         )
 
-    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.5"])  # type: ignore[misc]
+    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.7"])  # type: ignore[misc]
     def test_snowml_model_deploy_snowml_sklearn(
         self,
         permanent_deploy: Optional[bool] = False,
@@ -110,7 +110,7 @@ def test_snowml_model_deploy_snowml_sklearn(
             test_released_version=test_released_version,
         )
 
-    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.5"])  # type: ignore[misc]
+    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.7"])  # type: ignore[misc]
     def test_snowml_model_deploy_xgboost(
         self,
         permanent_deploy: Optional[bool] = False,
@@ -143,7 +143,7 @@ def test_snowml_model_deploy_xgboost(
             test_released_version=test_released_version,
         )
 
-    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.5"])  # type: ignore[misc]
+    @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.7"])  # type: ignore[misc]
     def test_snowml_model_deploy_lightgbm(
         self,
         permanent_deploy: Optional[bool] = False,
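
For context, absl's `parameterized.product` expands its keyword lists into a full cross product, so each decorated test above runs four times (2 deploy modes x 2 released versions). A minimal sketch:

    from absl.testing import absltest, parameterized

    class ProductDemoTest(parameterized.TestCase):
        # 2 x 2 = 4 generated test cases.
        @parameterized.product(permanent_deploy=[True, False], test_released_version=[None, "1.0.7"])
        def test_combo(self, permanent_deploy, test_released_version):
            self.assertIsInstance(permanent_deploy, bool)

    if __name__ == "__main__":
        absltest.main()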

tests/integ/snowflake/ml/modeling/pipeline/test_pipeline.py

Lines changed: 39 additions & 37 deletions
@@ -124,45 +124,47 @@ def test_serde(self) -> None:
         mms = MinMaxScaler(input_cols=output_cols, output_cols=pipeline_output_cols)
         pipeline = snowml_pipeline.Pipeline([("ss", ss), ("mms", mms)])
         pipeline.fit(df1)
-        filepath = os.path.join(tempfile.gettempdir(), "test_pipeline.pkl")
-        self._to_be_deleted_files.append(filepath)
-        pipeline_dump_cloudpickle = cloudpickle.dumps(pipeline)
-        pipeline_dump_pickle = pickle.dumps(pipeline)
-        joblib.dump(pipeline, filepath)
+        with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file:
+            self._to_be_deleted_files.append(file.name)
+            pipeline_dump_cloudpickle = cloudpickle.dumps(pipeline)
+            pipeline_dump_pickle = pickle.dumps(pipeline)
+            joblib.dump(pipeline, file.name)
+
+            self._session.close()
+
+            # transform in session 2
+            self._session = Session.builder.configs(SnowflakeLoginOptions()).create()
+            _, df2 = framework_utils.get_df(self._session, data, schema, np.nan)
+            input_cols_extended = input_cols.copy()
+            input_cols_extended.append(id_col)
+
+            importlib.reload(sys.modules["snowflake.ml.modeling.pipeline"])
+
+            # cloudpickle
+            pipeline_load_cloudpickle = cloudpickle.loads(pipeline_dump_cloudpickle)
+            transformed_df_cloudpickle = pipeline_load_cloudpickle.transform(df2[input_cols_extended])
+            actual_arr_cloudpickle = (
+                transformed_df_cloudpickle.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
+            )
 
-        self._session.close()
+            # pickle
+            pipeline_load_pickle = pickle.loads(pipeline_dump_pickle)
+            transformed_df_pickle = pipeline_load_pickle.transform(df2[input_cols_extended])
+            actual_arr_pickle = transformed_df_pickle.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
 
-        # transform in session 2
-        self._session = Session.builder.configs(SnowflakeLoginOptions()).create()
-        _, df2 = framework_utils.get_df(self._session, data, schema, np.nan)
-        input_cols_extended = input_cols.copy()
-        input_cols_extended.append(id_col)
-
-        importlib.reload(sys.modules["snowflake.ml.modeling.pipeline"])
-
-        # cloudpickle
-        pipeline_load_cloudpickle = cloudpickle.loads(pipeline_dump_cloudpickle)
-        transformed_df_cloudpickle = pipeline_load_cloudpickle.transform(df2[input_cols_extended])
-        actual_arr_cloudpickle = transformed_df_cloudpickle.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
-
-        # pickle
-        pipeline_load_pickle = pickle.loads(pipeline_dump_pickle)
-        transformed_df_pickle = pipeline_load_pickle.transform(df2[input_cols_extended])
-        actual_arr_pickle = transformed_df_pickle.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
-
-        # joblib
-        pipeline_load_joblib = joblib.load(filepath)
-        transformed_df_joblib = pipeline_load_joblib.transform(df2[input_cols_extended])
-        actual_arr_joblib = transformed_df_joblib.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
-
-        # sklearn
-        skpipeline = SkPipeline([("ss", SklearnStandardScaler()), ("mms", SklearnMinMaxScaler())])
-        skpipeline.fit(df_pandas[input_cols])
-        sklearn_arr = skpipeline.transform(df_pandas[input_cols])
-
-        assert np.allclose(actual_arr_cloudpickle, sklearn_arr)
-        assert np.allclose(actual_arr_pickle, sklearn_arr)
-        assert np.allclose(actual_arr_joblib, sklearn_arr)
+            # joblib
+            pipeline_load_joblib = joblib.load(file.name)
+            transformed_df_joblib = pipeline_load_joblib.transform(df2[input_cols_extended])
+            actual_arr_joblib = transformed_df_joblib.sort(id_col)[pipeline_output_cols].to_pandas().to_numpy()
+
+            # sklearn
+            skpipeline = SkPipeline([("ss", SklearnStandardScaler()), ("mms", SklearnMinMaxScaler())])
+            skpipeline.fit(df_pandas[input_cols])
+            sklearn_arr = skpipeline.transform(df_pandas[input_cols])
+
+            np.testing.assert_allclose(actual_arr_cloudpickle, sklearn_arr)
+            np.testing.assert_allclose(actual_arr_pickle, sklearn_arr)
+            np.testing.assert_allclose(actual_arr_joblib, sklearn_arr)
 
     def test_pipeline_with_regression_estimators(self) -> None:
         input_df_pandas = load_diabetes(as_frame=True).frame
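
Two details of this rewrite are worth noting: `NamedTemporaryFile(delete=False)` yields a path that stays valid after the with-block (the test appends `file.name` to its own cleanup list), and `np.testing.assert_allclose` replaces bare `assert np.allclose(...)` so a failure reports which elements differ. A short standalone sketch of both:

    import os
    import tempfile

    import numpy as np

    # delete=False keeps the file on disk after the with-block; clean up manually.
    with tempfile.NamedTemporaryFile(suffix=".pkl", delete=False) as file:
        path = file.name

    # Unlike a bare assert, this raises with a per-element mismatch report on failure.
    np.testing.assert_allclose([1.0, 2.0], [1.0, 2.0 + 1e-9], rtol=1e-7)

    os.remove(path)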

tests/integ/snowflake/ml/modeling/preprocessing/BUILD_NATIVE.bzl

Lines changed: 2 additions & 0 deletions
@@ -122,6 +122,8 @@ def get_build_rules_for_native_impl():
     py_test(
         name = "test_drop_input_cols",
         srcs = ["test_drop_input_cols.py"],
+        shard_count = SHARD_COUNT,
+        timeout = TIMEOUT,
         deps = [
             "//snowflake/ml/modeling/impute:simple_imputer",
             "//snowflake/ml/modeling/pipeline:pipeline",

tests/integ/snowflake/ml/registry/model_registry_integ_test_snowservice_merge_gate.py

Lines changed: 29 additions & 29 deletions
@@ -2,40 +2,40 @@
 # Copyright (c) 2012-2022 Snowflake Computing Inc. All rights reserved.
 #
 
-import uuid
+# import uuid
 
-import pandas as pd
-import pytest
+# import pandas as pd
+# import pytest
 from absl.testing import absltest
 
-from snowflake.ml.model import deploy_platforms
-from tests.integ.snowflake.ml.registry.model_registry_integ_test_snowservice_base import (
-    TestModelRegistryIntegSnowServiceBase,
-)
-from tests.integ.snowflake.ml.test_utils import model_factory
+# from snowflake.ml.model import deploy_platforms
+# from tests.integ.snowflake.ml.registry.model_registry_integ_test_snowservice_base import (
+#     TestModelRegistryIntegSnowServiceBase,
+# )
+# from tests.integ.snowflake.ml.test_utils import model_factory
 
 
-class TestModelRegistryIntegWithSnowServiceDeployment(TestModelRegistryIntegSnowServiceBase):
-    @pytest.mark.pip_incompatible
-    def test_snowml_model_deployment_xgboost(self) -> None:
-        self._test_snowservice_deployment(
-            model_name="xgboost_model",
-            model_version=uuid.uuid4().hex,
-            prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_snowml_model_xgb,
-            prediction_assert_fn=lambda local_prediction, remote_prediction: pd.testing.assert_frame_equal(
-                remote_prediction, local_prediction, check_dtype=False
-            ),
-            deployment_options={
-                "platform": deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,
-                "target_method": "predict",
-                "options": {
-                    "compute_pool": self._TEST_CPU_COMPUTE_POOL,
-                    "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO),
-                    "enable_remote_image_build": True,
-                },
-            },
-            omit_target_method_when_deploy=True,
-        )
+# class TestModelRegistryIntegWithSnowServiceDeployment(TestModelRegistryIntegSnowServiceBase):
+#     @pytest.mark.pip_incompatible
+#     def test_snowml_model_deployment_xgboost(self) -> None:
+#         self._test_snowservice_deployment(
+#             model_name="xgboost_model",
+#             model_version=uuid.uuid4().hex,
+#             prepare_model_and_feature_fn=model_factory.ModelFactory.prepare_snowml_model_xgb,
+#             prediction_assert_fn=lambda local_prediction, remote_prediction: pd.testing.assert_frame_equal(
+#                 remote_prediction, local_prediction, check_dtype=False
+#             ),
+#             deployment_options={
+#                 "platform": deploy_platforms.TargetPlatform.SNOWPARK_CONTAINER_SERVICES,
+#                 "target_method": "predict",
+#                 "options": {
+#                     "compute_pool": self._TEST_CPU_COMPUTE_POOL,
+#                     "image_repo": self._db_manager.get_snowservice_image_repo(repo=self._TEST_IMAGE_REPO),
+#                     "enable_remote_image_build": True,
+#                 },
+#             },
+#             omit_target_method_when_deploy=True,
+#         )
 
 
 if __name__ == "__main__":
