|
78 | 78 | "name": "stderr", |
79 | 79 | "output_type": "stream", |
80 | 80 | "text": [ |
81 | | - "downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n", |
82 | | - "unzipping files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:04<00:00, 1.60files/s]\n" |
| 81 | + "downloading ml-25m.zip: 262MB [00:10, 24.4MB/s] \n", |
| 82 | + "unzipping files: 100%|█████████████████████████████| 8/8 [00:04<00:00, 1.61files/s]\n" |
83 | 83 | ] |
84 | 84 | } |
85 | 85 | ], |
|
155 | 155 | "source": [ |
156 | 156 | "%%writefile \"./tf_trainer.py\"\n", |
157 | 157 | "\n", |
| 158 | + "import argparse\n", |
158 | 159 | "import os\n", |
159 | 160 | "\n", |
160 | 161 | "# the order of statements and imports is imoportant\n", |
|
180 | 181 | "\n", |
181 | 182 | "from merlin.loader.tensorflow import Loader\n", |
182 | 183 | "\n", |
| 184 | + "parser = argparse.ArgumentParser()\n", |
| 185 | + "parser.add_argument(\"--data_path\", default=None, help=\"Input directory.\")\n", |
| 186 | + "parser.add_argument(\"--batch_size\", default=None, help=\"Batch size.\")\n", |
| 187 | + "args = parser.parse_args()\n", |
183 | 188 | "\n", |
184 | | - "DATA_PATH = os.getenv(\"DATA_PATH\", os.path.expanduser(\"~/workspace\"))\n", |
| 189 | + "DATA_PATH = args.data_path or os.path.expanduser(\"~/workspace\")\n", |
| 190 | + "BATCH_SIZE = args.batch_size or 1024\n", |
185 | 191 | "\n", |
186 | 192 | "dataset = Dataset(os.path.join(DATA_PATH, \"train.parquet\"))\n", |
187 | 193 | "dataset = dataset.repartition(MPI_SIZE)\n", |
188 | 194 | "\n", |
189 | 195 | "loader = Loader(\n", |
190 | 196 | " dataset,\n", |
191 | | - " batch_size=64 * 1024,\n", |
| 197 | + " batch_size=BATCH_SIZE,\n", |
192 | 198 | " global_size=MPI_SIZE,\n", |
193 | 199 | " global_rank=MPI_RANK,\n", |
194 | 200 | " device=MPI_RANK,\n", |
|
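Note that `--batch_size` is declared with `type=int`: `args.batch_size or 1024` is passed straight through to the `Loader`, so argparse must convert the flag value, otherwise the raw string would be handed to the loader. A minimal standalone sketch of why the conversion matters:

    import argparse

    # With type=int, argparse converts the flag; without it, the value
    # arrives as the string "65536", and `"65536" or 1024` would silently
    # pass a string batch size downstream.
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=None, type=int)
    args = parser.parse_args(["--batch_size=65536"])
    assert args.batch_size == 65536  # an int, ready for the data loader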
280 | 286 | }, |
281 | 287 | { |
282 | 288 | "cell_type": "code", |
283 | | - "execution_count": 6, |
| 289 | + "execution_count": null, |
284 | 290 | "id": "ec5e9b7f", |
285 | 291 | "metadata": { |
286 | 292 | "scrolled": true |
|
290 | 296 | "name": "stdout", |
291 | 297 | "output_type": "stream", |
292 | 298 | "text": [ |
293 | | - "2023-06-03 21:35:18.892140: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
294 | | - "2023-06-03 21:35:18.932879: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", |
295 | | - "To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
296 | | - "[1,1]<stderr>:2023-06-03 21:35:23.549563: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
297 | | - "[1,0]<stderr>:2023-06-03 21:35:23.568539: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
298 | | - "[1,1]<stderr>:2023-06-03 21:35:23.592349: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", |
299 | | - "[1,1]<stderr>:To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
300 | | - "[1,0]<stderr>:2023-06-03 21:35:23.609861: I tensorflow/core/platform/cpu_feature_guard.cc:183] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", |
301 | | - "[1,0]<stderr>:To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
302 | | - "[1,1]<stderr>:2023-06-03 21:35:28.092241: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n", |
303 | | - "[1,1]<stderr>:2023-06-03 21:35:28.092336: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24337 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n", |
304 | | - "[1,0]<stderr>:2023-06-03 21:35:28.141988: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:226] Using CUDA malloc Async allocator for GPU: 0\n", |
305 | | - "[1,0]<stderr>:2023-06-03 21:35:28.142076: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1638] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24338 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n", |
306 | | - "[1,0]<stderr>:2023-06-03 21:35:32.089463: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f1150020480 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n", |
307 | | - "[1,0]<stderr>:2023-06-03 21:35:32.089532: I tensorflow/compiler/xla/service/service.cc:177] StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6\n", |
308 | | - "[1,1]<stderr>:2023-06-03 21:35:32.089552: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x79f8590 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:\n", |
309 | | - "[1,1]<stderr>:2023-06-03 21:35:32.089613: I tensorflow/compiler/xla/service/service.cc:177] StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6\n", |
310 | | - "[1,1]<stderr>:2023-06-03 21:35:32.101885: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", |
311 | | - "[1,0]<stderr>:2023-06-03 21:35:32.102268: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.\n", |
312 | | - "[1,1]<stderr>:2023-06-03 21:35:33.637854: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900\n", |
313 | | - "[1,0]<stderr>:2023-06-03 21:35:33.648275: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8900\n", |
314 | | - "[1,1]<stderr>:2023-06-03 21:35:33.834015: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n", |
315 | | - "[1,0]<stderr>:2023-06-03 21:35:33.854743: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n", |
316 | | - "[1,0]<stdout>:Step #0\tLoss: 13.976147\n", |
317 | | - "[1,0]<stdout>:Step #10\tLoss: 13.746956\n", |
318 | | - "[1,0]<stdout>:Step #20\tLoss: 13.907515\n", |
319 | | - "[1,0]<stdout>:Step #30\tLoss: 14.084653\n", |
320 | | - "[1,0]<stdout>:Step #40\tLoss: 13.346972\n", |
321 | | - "[1,0]<stdout>:Step #50\tLoss: 13.931261\n", |
322 | | - "[1,0]<stdout>:Step #60\tLoss: 13.707795\n", |
323 | | - "[1,0]<stdout>:Step #70\tLoss: 13.510033\n", |
324 | | - "[1,0]<stdout>:Step #80\tLoss: 13.372274\n", |
325 | | - "[1,0]<stdout>:Step #90\tLoss: 13.713926\n", |
326 | | - "[1,0]<stdout>:Step #100\tLoss: 13.236437\n", |
327 | | - "[1,0]<stdout>:Step #110\tLoss: 13.265822\n", |
328 | | - "[1,0]<stdout>:Step #120\tLoss: 13.991277\n", |
329 | | - "[1,0]<stdout>:Step #130\tLoss: 14.069466\n", |
330 | | - "[1,0]<stdout>:Step #140\tLoss: 13.635876\n", |
331 | | - "[1,0]<stdout>:Step #150\tLoss: 13.416016\n", |
332 | | - "[1,0]<stdout>:Step #160\tLoss: 13.216636\n", |
333 | | - "[1,0]<stdout>:Step #170\tLoss: 12.776440\n", |
334 | | - "[1,0]<stdout>:Step #180\tLoss: 13.570569\n", |
335 | | - "[1,0]<stdout>:Step #190\tLoss: 13.868576\n", |
336 | | - "[1,1]<stderr>:/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", |
337 | | - "[1,1]<stderr>: warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n", |
338 | | - "[1,0]<stderr>:/usr/local/lib/python3.8/dist-packages/merlin/dtypes/mappings/torch.py:43: UserWarning: PyTorch dtype mappings did not load successfully due to an error: No module named 'torch'\n", |
339 | | - "[1,0]<stderr>: warn(f\"PyTorch dtype mappings did not load successfully due to an error: {exc.msg}\")\n" |
| 299 | + "2023-06-08 04:04:31.525132: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", |
| 300 | + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
| 301 | + "2023-06-08 04:04:31.640485: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
| 302 | + "[1,0]<stderr>:2023-06-08 04:04:34.931845: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", |
| 303 | + "[1,0]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
| 304 | + "[1,1]<stderr>:2023-06-08 04:04:34.939388: I tensorflow/core/platform/cpu_feature_guard.cc:194] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: SSE3 SSE4.1 SSE4.2 AVX\n", |
| 305 | + "[1,1]<stderr>:To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", |
| 306 | + "[1,1]<stderr>:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
| 307 | + "[1,0]<stderr>:2023-06-08 04:04:35.046788: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", |
| 308 | + "[1,1]<stderr>:2023-06-08 04:04:41.286722: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", |
| 309 | + "[1,1]<stderr>:2023-06-08 04:04:41.286821: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:b3:00.0, compute capability: 8.6\n", |
| 310 | + "[1,0]<stderr>:2023-06-08 04:04:41.292086: I tensorflow/core/common_runtime/gpu/gpu_process_state.cc:222] Using CUDA malloc Async allocator for GPU: 0\n", |
| 311 | + "[1,0]<stderr>:2023-06-08 04:04:41.292173: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1621] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 24570 MB memory: -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6\n", |
| 312 | + "[1,1]<stderr>:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n", |
| 313 | + "[1,1]<stderr>:Instructions for updating:\n", |
| 314 | + "[1,1]<stderr>:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n", |
| 315 | + "[1,0]<stderr>:WARNING:tensorflow:From /usr/local/lib/python3.8/dist-packages/tensorflow/python/autograph/pyct/static_analysis/liveness.py:83: Analyzer.lamba_check (from tensorflow.python.autograph.pyct.static_analysis.liveness) is deprecated and will be removed after 2023-09-23.\n", |
| 316 | + "[1,0]<stderr>:Instructions for updating:\n", |
| 317 | + "[1,0]<stderr>:Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089\n", |
| 318 | + "[1,0]<stdout>:Step #0\tLoss: 13.976286\n", |
| 319 | + "[1,0]<stdout>:Step #10\tLoss: 13.746111\n", |
| 320 | + "[1,0]<stdout>:Step #20\tLoss: 13.905323\n", |
| 321 | + "[1,0]<stdout>:Step #30\tLoss: 14.093473\n", |
| 322 | + "[1,0]<stdout>:Step #40\tLoss: 13.336206\n", |
| 323 | + "[1,0]<stdout>:Step #50\tLoss: 13.932583\n", |
| 324 | + "[1,0]<stdout>:Step #60\tLoss: 13.702780\n", |
| 325 | + "[1,0]<stdout>:Step #70\tLoss: 13.522057\n", |
| 326 | + "[1,0]<stdout>:Step #80\tLoss: 13.382860\n", |
| 327 | + "[1,0]<stdout>:Step #90\tLoss: 13.701270\n", |
| 328 | + "[1,0]<stdout>:Step #100\tLoss: 13.240610\n", |
| 329 | + "[1,0]<stdout>:Step #110\tLoss: 13.264977\n", |
| 330 | + "[1,0]<stdout>:Step #120\tLoss: 13.984927\n", |
| 331 | + "[1,0]<stdout>:Step #130\tLoss: 14.039978\n", |
| 332 | + "[1,0]<stdout>:Step #140\tLoss: 13.639907\n", |
| 333 | + "[1,0]<stdout>:Step #150\tLoss: 13.430090\n", |
| 334 | + "[1,0]<stdout>:Step #160\tLoss: 13.219415\n", |
| 335 | + "[1,0]<stdout>:Step #170\tLoss: 12.758451\n", |
| 336 | + "[1,0]<stdout>:Step #180\tLoss: 13.592442\n" |
340 | 337 | ] |
341 | 338 | } |
342 | 339 | ], |
343 | 340 | "source": [ |
344 | | - "!horovodrun -np {GPU_COUNT} python tf_trainer.py" |
| 341 | + "!horovodrun -np {GPU_COUNT} python tf_trainer.py --data_path={DATA_PATH} --batch_size=65536" |
345 | 342 | ] |
346 | 343 | }, |
347 | 344 | { |
|
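The `-np {GPU_COUNT}` flag makes `horovodrun` launch one copy of `tf_trainer.py` per GPU; each copy then reads its own shard of the dataset via the `global_size`/`global_rank` arguments to `Loader`. A minimal sketch, assuming `MPI_SIZE` and `MPI_RANK` in the trainer come from Horovod's TensorFlow bindings (their definition is not shown in this diff):

    import horovod.tensorflow as hvd

    hvd.init()
    MPI_SIZE = hvd.size()  # number of processes launched by horovodrun (-np)
    MPI_RANK = hvd.rank()  # this process's index; also used as its GPU device id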