NVIDIA-Merlin
diff --git a/‎examples/getting-started-movielens/01-Download-Convert.ipynb‎
Lines changed: 20 additions & 7 deletions b/‎examples/getting-started-movielens/01-Download-Convert.ipynb‎
Lines changed: 20 additions & 7 deletions
@@ -51,8 +51,6 @@
     "# External dependencies\n",
     "import os\n",
     "\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "\n",
     "from nvtabular.utils import download_file\n",
     "\n",
     "# Get dataframe library - cudf or pandas\n",
@@ -89,7 +87,16 @@
    "cell_type": "code",
    "execution_count": 4,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "downloading ml-25m.zip: 262MB [00:06, 42.1MB/s]                                                                                                                                            \n",
+      "unzipping files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00,  1.74files/s]\n"
+     ]
+    }
+   ],
    "source": [
     "download_file(\n",
     "    \"http://files.grouplens.org/datasets/movielens/ml-25m.zip\",\n",
@@ -415,7 +422,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "We drop the timestamp column and split the ratings into training and test dataset. We use a simple random split."
+    "We drop the timestamp column and split the ratings into training and validation datasets. We use a simple random split."
    ]
   },
   {
@@ -425,9 +432,15 @@
    "outputs": [],
    "source": [
     "ratings = ratings.drop(\"timestamp\", axis=1)\n",
-    "# convert ratings to pandas df to use sklearn train_test_split func\n",
-    "ratings = ratings.to_pandas()\n",
-    "train, valid = train_test_split(ratings, test_size=0.2, random_state=42)"
+    "\n",
+    "# shuffle the dataset; the fixed seed keeps the split reproducible across runs\n",
+    "ratings = ratings.sample(len(ratings), replace=False, random_state=42)\n",
+    "\n",
+    "# hold out the last 20% of the shuffled ratings as the validation set\n",
+    "num_valid = int(len(ratings) * 0.2)\n",
+    "split_at = len(ratings) - num_valid  # positive index: slicing with -0 would leave train empty\n",
+    "train = ratings[:split_at]\n",
+    "valid = ratings[split_at:]"
    ]
   },
   {