|
78 | 78 | "name": "stderr", |
79 | 79 | "output_type": "stream", |
80 | 80 | "text": [ |
81 | | - "downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n", |
82 | | - "unzipping files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00, 1.60files/s]\n" |
| 81 | + "downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n", |
| 82 | + "unzipping files: 100%|█████████████████████████████| 8/8 [00:04<00:00, 1.61files/s]\n" |
83 | 83 | ] |
84 | 84 | } |
85 | 85 | ], |
|
155 | 155 | "source": [ |
156 | 156 | "%%writefile \"./tf_trainer.py\"\n", |
157 | 157 | "\n", |
| 158 | + "import argparse\n", |
158 | 159 | "import os\n", |
159 | 160 | "\n", |
160 | 161 | "# the order of statements and imports is imoportant\n", |
|
180 | 181 | "\n", |
181 | 182 | "from merlin.loader.tensorflow import Loader\n", |
182 | 183 | "\n", |
| 184 | + "parser = argparse.ArgumentParser()\n", |
| 185 | + "parser.add_argument(\"--data_path\", default=None, help=\"Input directory.\")\n", |
| 186 | + "parser.add_argument(\"--batch_size\", default=None, help=\"Batch size.\")\n", |
| 187 | + "args = parser.parse_args()\n", |
183 | 188 | "\n", |
184 | | - "DATA_PATH = os.getenv(\"DATA_PATH\", os.path.expanduser(\"~/workspace\"))\n", |
| 189 | + "DATA_PATH = args.data_path or os.path.expanduser(\"~/workspace\")\n", |
| 190 | + "BATCH_SIZE = args.batch_size or 1024\n", |
185 | 191 | "\n", |
186 | 192 | "dataset = Dataset(os.path.join(DATA_PATH, \"train.parquet\"))\n", |
187 | 193 | "dataset = dataset.repartition(MPI_SIZE)\n", |
188 | 194 | "\n", |
189 | 195 | "loader = Loader(\n", |
190 | 196 | " dataset,\n", |
191 | | - " batch_size=64 * 1024,\n", |
| 197 | + " batch_size=BATCH_SIZE,\n", |
192 | 198 | " global_size=MPI_SIZE,\n", |
193 | 199 | " global_rank=MPI_RANK,\n", |
194 | 200 | " device=MPI_RANK,\n", |
|
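Note that `--batch_size` is declared with `type=int`: `args.batch_size or 1024` is passed straight through to the `Loader`, so argparse must convert the flag value, otherwise the raw string would be handed to the loader. A minimal standalone sketch of why the conversion matters:

    import argparse

    # With type=int, argparse converts the flag; without it, the value
    # arrives as the string "65536", and `"65536" or 1024` would silently
    # pass a string batch size downstream.
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=None, type=int)
    args = parser.parse_args(["--batch_size=65536"])
    assert args.batch_size == 65536  # an int, ready for the data loader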
280 | 286 | }, |
281 | 287 | { |
282 | 288 | "cell_type": "code", |
283 | | - "execution_count": 6, |
| 289 | + "execution_count": null, |
284 | 290 | "id": "ec5e9b7f", |
285 | 291 | "metadata": { |
286 | 292 | "scrolled": true |
|
290 | 296 | "name": "stdout", |
291 | 297 | "output_type": "stream", |
292 | 298 | "text": [ |
293 | | - "2023-06-03 21:35:18.892140: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
294 | | - "2023-06-03 21:35:18.932879: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", |
295 | | - "To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
296 | | - "[1,1]<stderr>:2023-06-03 21:35:23.549563: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
297 | | - "[1,0]<stderr>:2023-06-03 21:35:23.568539: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
298 | | - "[1,1]<stderr>:2023-06-03 21:35:23.592349: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", |
299 | | - "[1,1]<stderr>:To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
300 | | - "[1,0]<stderr>:2023-06-03 21:35:23.609861: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", |
301 | | - "[1,0]<stderr>:To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
302 | | - "[1,1]<stderr>:2023-06-03 21:35:28.092241: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n", |
303 | | - "[1,1]<stderr>:2023-06-03 21:35:28.092336: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24337 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n", |
304 | | - "[1,0]<stderr>:2023-06-03 21:35:28.141988: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n", |
305 | | - "[1,0]<stderr>:2023-06-03 21:35:28.142076: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24338 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n", |
306 | | - "[1,0]<stderr>:2023-06-03 21:35:32.089463: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f1150020480 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n", |
307 | | - "[1,0]<stderr>:2023-06-03 21:35:32.089532: I tensorflow/compiler/xla/service/service.cc:177] StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6\n", |
308 | | - "[1,1]<stderr>:2023-06-03 21:35:32.089552: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x79f8590 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n", |
309 | | - "[1,1]<stderr>:2023-06-03 21:35:32.089613: I tensorflow/compiler/xla/service/service.cc:177] StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6\n", |
310 | | - "[1,1]<stderr>:2023-06-03 21:35:32.101885: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", |
311 | | - "[1,0]<stderr>:2023-06-03 21:35:32.102268: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", |
312 | | - "[1,1]<stderr>:2023-06-03 21:35:33.637854: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900\n", |
313 | | - "[1,0]<stderr>:2023-06-03 21:35:33.648275: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900\n", |
314 | | - "[1,1]<stderr>:2023-06-03 21:35:33.834015: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n", |
315 | | - "[1,0]<stderr>:2023-06-03 21:35:33.854743: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n", |
316 | | - "[1,0]<stdout>:Step #0\tLoss: 13.976147\n", |
317 | | - "[1,0]<stdout>:Step #10\tLoss: 13.746956\n", |
318 | | - "[1,0]<stdout>:Step #20\tLoss: 13.907515\n", |
319 | | - "[1,0]<stdout>:Step #30\tLoss: 14.084653\n", |
320 | | - "[1,0]<stdout>:Step #40\tLoss: 13.346972\n", |
321 | | - "[1,0]<stdout>:Step #50\tLoss: 13.931261\n", |
322 | | - "[1,0]<stdout>:Step #60\tLoss: 13.707795\n", |
323 | | - "[1,0]<stdout>:Step #70\tLoss: 13.510033\n", |
324 | | - "[1,0]<stdout>:Step #80\tLoss: 13.372274\n", |
325 | | - "[1,0]<stdout>:Step #90\tLoss: 13.713926\n", |
326 | | - "[1,0]<stdout>:Step #100\tLoss: 13.236437\n", |
327 | | - "[1,0]<stdout>:Step #110\tLoss: 13.265822\n", |
328 | | - "[1,0]<stdout>:Step #120\tLoss: 13.991277\n", |
329 | | - "[1,0]<stdout>:Step #130\tLoss: 14.069466\n", |
330 | | - "[1,0]<stdout>:Step #140\tLoss: 13.635876\n", |
331 | | - "[1,0]<stdout>:Step #150\tLoss: 13.416016\n", |
332 | | - "[1,0]<stdout>:Step #160\tLoss: 13.216636\n", |
333 | | - "[1,0]<stdout>:Step #170\tLoss: 12.776440\n", |
334 | | - "[1,0]<stdout>:Step #180\tLoss: 13.570569\n", |
335 | | - "[1,0]<stdout>:Step #190\tLoss: 13.868576\n", |
336 | | - "[1,1]<stderr>:/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", |
337 | | - "[1,1]<stderr>: warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n", |
338 | | - "[1,0]<stderr>:/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", |
339 | | - "[1,0]<stderr>: warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n" |
| 299 | + "2023-06-08 04:04:31.525132: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", |
| 300 | + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
| 301 | + "2023-06-08 04:04:31.640485: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
| 302 | + "[1,0]<stderr>:2023-06-08 04:04:34.931845: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", |
| 303 | + "[1,0]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
| 304 | + "[1,1]<stderr>:2023-06-08 04:04:34.939388: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", |
| 305 | + "[1,1]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
| 306 | + "[1,1]<stderr>:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
| 307 | + "[1,0]<stderr>:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
| 308 | + "[1,1]<stderr>:2023-06-08 04:04:41.286722: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", |
| 309 | + "[1,1]<stderr>:2023-06-08 04:04:41.286821: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n", |
| 310 | + "[1,0]<stderr>:2023-06-08 04:04:41.292086: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", |
| 311 | + "[1,0]<stderr>:2023-06-08 04:04:41.292173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n", |
| 312 | + "[1,1]<stderr>:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n", |
| 313 | + "[1,1]<stderr>:Instructions for updating:\n", |
| 314 | + "[1,1]<stderr>:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n", |
| 315 | + "[1,0]<stderr>:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n", |
| 316 | + "[1,0]<stderr>:Instructions for updating:\n", |
| 317 | + "[1,0]<stderr>:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n", |
| 318 | + "[1,0]<stdout>:Step #0\tLoss: 13.976286\n", |
| 319 | + "[1,0]<stdout>:Step #10\tLoss: 13.746111\n", |
| 320 | + "[1,0]<stdout>:Step #20\tLoss: 13.905323\n", |
| 321 | + "[1,0]<stdout>:Step #30\tLoss: 14.093473\n", |
| 322 | + "[1,0]<stdout>:Step #40\tLoss: 13.336206\n", |
| 323 | + "[1,0]<stdout>:Step #50\tLoss: 13.932583\n", |
| 324 | + "[1,0]<stdout>:Step #60\tLoss: 13.702780\n", |
| 325 | + "[1,0]<stdout>:Step #70\tLoss: 13.522057\n", |
| 326 | + "[1,0]<stdout>:Step #80\tLoss: 13.382860\n", |
| 327 | + "[1,0]<stdout>:Step #90\tLoss: 13.701270\n", |
| 328 | + "[1,0]<stdout>:Step #100\tLoss: 13.240610\n", |
| 329 | + "[1,0]<stdout>:Step #110\tLoss: 13.264977\n", |
| 330 | + "[1,0]<stdout>:Step #120\tLoss: 13.984927\n", |
| 331 | + "[1,0]<stdout>:Step #130\tLoss: 14.039978\n", |
| 332 | + "[1,0]<stdout>:Step #140\tLoss: 13.639907\n", |
| 333 | + "[1,0]<stdout>:Step #150\tLoss: 13.430090\n", |
| 334 | + "[1,0]<stdout>:Step #160\tLoss: 13.219415\n", |
| 335 | + "[1,0]<stdout>:Step #170\tLoss: 12.758451\n", |
| 336 | + "[1,0]<stdout>:Step #180\tLoss: 13.592442\n" |
340 | 337 | ] |
341 | 338 | } |
342 | 339 | ], |
343 | 340 | "source": [ |
344 | | - "!horovodrun -np {GPU_COUNT} python tf_trainer.py" |
| 341 | + "!horovodrun -np {GPU_COUNT} python tf_trainer.py --data_path={DATA_PATH} --batch_size=65536" |
345 | 342 | ] |
346 | 343 | }, |
347 | 344 | { |
|
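The `-np {GPU_COUNT}` flag makes `horovodrun` launch one copy of `tf_trainer.py` per GPU; each copy then reads its own shard of the dataset via the `global_size`/`global_rank` arguments to `Loader`. A minimal sketch, assuming `MPI_SIZE` and `MPI_RANK` in the trainer come from Horovod's TensorFlow bindings (their definition is not shown in this diff):

    import horovod.tensorflow as hvd

    hvd.init()
    MPI_SIZE = hvd.size()  # number of processes launched by horovodrun (-np)
    MPI_RANK = hvd.rank()  # this process's index; also used as its GPU device id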