|
51 | 51 | "# External dependencies\n", |
52 | 52 | "import os\n", |
53 | 53 | "\n", |
54 | | - "from sklearn.model_selection import train_test_split\n", |
55 | | - "\n", |
56 | 54 | "from nvtabular.utils import download_file\n", |
57 | 55 | "\n", |
58 | 56 | "# Get dataframe library - cudf or pandas\n", |
|
89 | 87 | "cell_type": "code", |
90 | 88 | "execution_count": 4, |
91 | 89 | "metadata": {}, |
92 | | - "outputs": [], |
| 90 | + "outputs": [ |
| 91 | + { |
| 92 | + "name": "stderr", |
| 93 | + "output_type": "stream", |
| 94 | + "text": [ |
| 95 | + "downloading ml-25m.zip: 262MB [00:06, 42.1MB/s] \n", |
| 96 | + "unzipping files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00, 1.74files/s]\n" |
| 97 | + ] |
| 98 | + } |
| 99 | + ], |
93 | 100 | "source": [ |
94 | 101 | "download_file(\n", |
95 | 102 | " \"http://files.grouplens.org/datasets/movielens/ml-25m.zip\",\n", |
|
415 | 422 | "cell_type": "markdown", |
416 | 423 | "metadata": {}, |
417 | 424 | "source": [ |
418 | | - "We drop the timestamp column and split the ratings into training and test dataset. We use a simple random split." |
| 425 | + "We drop the timestamp column and split the ratings into training and validation datasets. We use a simple random split." |
419 | 426 | ] |
420 | 427 | }, |
421 | 428 | { |
|
425 | 432 | "outputs": [], |
426 | 433 | "source": [ |
427 | 434 | "ratings = ratings.drop(\"timestamp\", axis=1)\n", |
428 | | - "# convert ratings to pandas df to use sklearn train_test_split func\n", |
429 | | - "ratings = ratings.to_pandas()\n", |
430 | | - "train, valid = train_test_split(ratings, test_size=0.2, random_state=42)" |
| 435 | + "\n", |
| 436 | + "# shuffle the dataset; the fixed seed keeps the split reproducible across runs\n", |
| 437 | + "ratings = ratings.sample(len(ratings), replace=False, random_state=42)\n", |
| 438 | + "\n", |
| 439 | + "# hold out the last 20% of the shuffled ratings as the validation set\n", |
| 440 | + "num_valid = int(len(ratings) * 0.2)\n", |
| 441 | + "# slice from the front: with num_valid == 0 a negative offset would swap the two sets\n", |
| 442 | + "train = ratings[: len(ratings) - num_valid]\n", |
| 443 | + "valid = ratings[len(ratings) - num_valid :]" |
431 | 444 | ] |
432 | 445 | }, |
433 | 446 | { |
|
0 commit comments